<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e84326</article-id><article-id pub-id-type="doi">10.2196/84326</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Scalable Identification of Clinically Relevant Chronic Obstructive Pulmonary Disease Documents in Large-Scale Electronic Health Record Datasets With a Lightweight Natural Language Processing Model: Retrospective Cohort Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Al-Garadi</surname><given-names>Mohammed</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Davis</surname><given-names>Sharon E</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matheny</surname><given-names>Michael E</given-names></name><degrees>MD, MS, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" 
rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Westerman</surname><given-names>Dax</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Conger</surname><given-names>Adrienne K</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Richmond</surname><given-names>Bradley W</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lasko</surname><given-names>Thomas A</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ricket</surname><given-names>Iben M</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Paulin</surname><given-names>Laura M</given-names></name><degrees>MD, MHS</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Brown</surname><given-names>Jeremiah R</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Reeves</surname><given-names>Ruth M</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Informatics, Vanderbilt University Medical 
Center</institution><addr-line>2525 West End Ave, Suite 1475</addr-line><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Medicine, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Biostatistics, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Tennessee Valley Healthcare System, Department of Veterans Affairs</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Division of Allergy, Pulmonology, and Critical Care Medicine, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Computer Science, Vanderbilt University</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff7"><institution>Department of Epidemiology, Dartmouth Geisel School of Medicine</institution><addr-line>Lebanon</addr-line><addr-line>NH</addr-line><country>United States</country></aff><aff id="aff8"><institution>Department of Pulmonology, Dartmouth Hitchcock Medical Center</institution><addr-line>Lebanon</addr-line><addr-line>NH</addr-line><country>United States</country></aff><aff id="aff9"><institution>Center for Implementation Science, Dartmouth Geisel School of Medicine</institution><addr-line>Lebanon</addr-line><addr-line>NH</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Al-Agil</surname><given-names>Mohammad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Ying</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mohammed Al-Garadi, PhD, Department of Biomedical Informatics, Vanderbilt University Medical Center, 2525 West End Ave, Suite 1475, Nashville, TN, 37203, United States, 1 (615) 936-6867; <email>mohammed.a.al-garadi@vumc.org</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>12</day><month>5</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e84326</elocation-id><history><date date-type="received"><day>17</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>31</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mohammed Al-Garadi, Sharon E Davis, Michael E Matheny, Dax Westerman, Adrienne K Conger, Bradley W Richmond, Thomas A Lasko, Iben M Ricket, Laura M Paulin, Jeremiah R Brown, Ruth M Reeves. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 12.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e84326"/><abstract><sec><title>Background</title><p>The widespread adoption of electronic health records has resulted in the generation of large volumes of clinical notes. Learning algorithms and large language models can be trained on these resources, but they are susceptible to noise&#x2014;irrelevant or noninformative data. This sensitivity can lead to significant challenges, including performance degradation and the generation of inaccurate predictions or &#x201C;hallucinations.&#x201D; This study addresses a critical challenge in clinical informatics: efficiently filtering millions of documents for relevance before advanced language model processing, particularly in resource-constrained environments.</p></sec><sec><title>Objective</title><p>We present a novel framework for determining document relevance in clinical settings using a chronic obstructive pulmonary disease (COPD) dataset.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed a novel framework using weak supervision and domain-expert heuristics to generate &#x201C;silver standard&#x201D; labels for training data and gold standard expert-annotated labels, creating 2 datasets to optimize the model during the development phase and subsequent testing phase. Various text representation techniques (bag of words, term frequency&#x2013;inverse document frequency, lightweight document embeddings, compression-based features, and Unified Medical Language System concept extraction) were evaluated. These representations were used to train random forest, extreme gradient boosting, and k-nearest neighbor classifiers. 
Models were optimized on a small expert-annotated dataset and evaluated on a held-out test set.</p></sec><sec sec-type="results"><title>Results</title><p>The combination of lightweight document embedding with a random forest classifier demonstrated the best performance, achieving a precision of 0.73, recall of 0.86, and <italic>F</italic><sub>1</sub>-score of 0.80 (95% CI 0.76-0.87) for identifying relevant COPD documents. This significantly outperformed baseline heuristics (precision=0.70; recall=0.38; <italic>F</italic><sub>1</sub>-score=0.50, 95% CI 0.43-0.56) and other tested methods.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our study presents a novel framework for identifying COPD-relevant clinical documents using lightweight embedding and machine learning. This approach effectively filters pertinent documents, enhancing information retrieval precision. The framework&#x2019;s scalability and minimal annotation needs make it promising for diverse health care applications, potentially optimizing clinical outcomes through efficient document selection for data-driven decision support systems.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>chronic obstructive pulmonary disease</kwd><kwd>electronic health record</kwd><kwd>machine learning</kwd><kwd>data mining</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Chronic obstructive pulmonary disease (COPD) is a leading cause of mortality in the United States, impacting an estimated 24 million people nationwide [<xref ref-type="bibr" rid="ref1">1</xref>]. COPD exacerbation&#x2013;related hospitalizations also incur substantial health care costs, ranging from US $7000 to US $39,200 per hospitalization [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Overall, COPD-attributable care amounted to US $49 billion in 2020, with hospitalization comprising approximately half of this total cost [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Artificial intelligence (AI) is transforming health care by enhancing the analysis, interpretation, and prediction of medical data. This technological advancement supports clinical decision-making and enables more efficient allocation of limited resources for chronic disease management, particularly for high-risk patients [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. While narrative clinical text is information rich, its unstructured nature poses challenges for direct use in AI-driven clinical decision support systems. To address this, information extraction and representation methods have emerged, leveraging large language models (LLMs) and deep learning techniques [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. These approaches aim to transform raw clinical narratives into structured, machine-readable data. However, significant challenges persist when processing vast document collections. Information relevant to a specific task may be dispersed across only a small fraction of the total corpus, making it inefficient to apply computationally intense LLMs to the entire dataset [<xref ref-type="bibr" rid="ref11">11</xref>]. Sophisticated preprocessing, filtering, and prioritization algorithms can identify and extract relevant documents, reducing the volume of text that needs to be processed at computationally intense stages of the pipeline [<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s1-2"><title>Weak Supervision for Developing Automated Natural Language Processing Classification Models</title><p>Weak supervision is a machine learning (ML) approach that mitigates the challenges of creating large, annotated datasets for natural language processing (NLP). 
It involves using lower-accuracy or less detailed programmatically generated labels and limited labeled data in semisupervised learning [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. This approach reduces the need for extensive manual labeling, speeding up the ML process and cutting costs. This method offers several advantages, including reduced annotation costs, improved scalability, and enhanced domain adaptability [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. Weak supervision allows for the incorporation of domain expertise, supports iterative refinement, and enables model development in low-resource scenarios [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. By minimizing manual labeling efforts, it facilitates the efficient creation of large-scale, domain-specific datasets, making it particularly valuable for developing robust NLP models across diverse fields [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Several recent studies have demonstrated the effectiveness of weak supervision in medical NLP tasks. Wang et al [<xref ref-type="bibr" rid="ref19">19</xref>] combined this approach with pretrained word embeddings to create a rule-based NLP algorithm for the automatic generation of training labels. They evaluated the effectiveness of these auto-generated labels across 4 supervised learning architectures: support vector machine, random forest (RF), fully connected networks, and convolutional neural networks. Their comparative analysis demonstrated the versatility and efficacy of this auto-labeling technique as the generated labels performed successfully across diverse model frameworks [<xref ref-type="bibr" rid="ref19">19</xref>]. 
Similarly, Cusick et al [<xref ref-type="bibr" rid="ref18">18</xref>] applied weak supervision to address the challenge of exhaustive manual labeling in clinical contexts. Building on a previous study that used a rule-based NLP system to automatically label the clinical notes of 600 patients with potential suicidal ideation [<xref ref-type="bibr" rid="ref18">18</xref>], they efficiently generated a sizable training dataset. This dataset was then used to train various statistical ML models and a convolutional neural network. Their research highlighted the potential of combining weak supervision and deep learning to enhance real-time clinical systems and facilitate research on suicidal ideation progression through automated analysis of clinical text [<xref ref-type="bibr" rid="ref18">18</xref>]. In another study in this field, Fries et al [<xref ref-type="bibr" rid="ref16">16</xref>] introduced Trove. This framework for weakly supervised medical entity classification leverages existing medical ontologies such as the Unified Medical Language System (UMLS) as a source of reusable, automated labeling heuristics. Trove&#x2019;s key innovation is its use of a label model to learn the accuracies of individual ontologies and correct for label noise when combining multiple sources. The authors demonstrated a weakly supervised performance on named entity recognition tasks for chemicals, diseases, and drugs that achieved results within 1.3 to 4.9 <italic>F</italic><sub>1</sub>-score points of fully supervised models.</p><p>These studies underscore the growing importance of weak supervision in overcoming the limitations of traditional manual annotation methods in clinical NLP tasks. By enabling the efficient creation of large-scale, domain-specific datasets, weak supervision is proving to be a valuable approach for developing robust NLP models across diverse clinical applications. 
This method offers a simplified way to auto-generate training labels, significantly reducing the need for manual annotation [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>Our proposed method uniquely integrates high-accuracy expert labels with weak supervision, offering a more efficient and accurate clinical document classification solution that overcomes traditional techniques&#x2019; limitations. In the following section, we will describe the rationale, significance, and unique contributions of our proposed approach, demonstrating how it advances clinical NLP methods and addresses critical needs in the field.</p></sec><sec id="s1-3"><title>Rationale, Importance, and Contributions</title><p>NLP models designed for categorizing clinical documents can support various downstream AI or ML tasks in health care, such as risk prediction, clinical summarization, and probabilistic phenotyping [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. The recent introduction of LLMs has created tremendous potential for health care to leverage these powerful tools. However, the bottleneck lies in the vast number of documents that these computationally intensive models need to process, which can reduce their efficiency and effectiveness. Lightweight NLP models solve this challenge by efficiently filtering documents and providing appropriate data inputs for more sophisticated models such as LLMs [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. These lightweight models could potentially identify and select relevant documents, which may enhance the accuracy and efficiency of the overall AI or ML pipeline. However, the effectiveness of these models depends on input data quality. 
<italic>International Classification of Diseases</italic> (<italic>ICD</italic>) codes, traditionally used to identify relevant documents, have proven ineffective [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. This observation underscores the limitations of relying exclusively on <italic>ICD</italic> codes for document selection in this context. The findings suggest the necessity for more refined and thorough identification methods to ensure the accurate capture of relevant COPD-related documents.</p><p>The potential effectiveness of these lightweight NLP models could stem from their ability to accurately identify and filter out irrelevant documents while potentially maintaining efficiency in computing power, memory use, and processing time. This efficiency is especially advantageous for resource-constrained health care systems, where simpler ML approaches that require significantly less computational power are often preferred over advanced NLP techniques [<xref ref-type="bibr" rid="ref28">28</xref>] such as LLMs.</p><p>By leveraging heuristic training data and a small expert-annotated dataset, these models can be optimized with minimal labeled data, significantly reducing the manual annotation burden. This approach addresses the persistent challenge of limited annotated data in clinical settings, potentially accelerating the development and deployment of AI-driven health care solutions [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>Our study makes 3 primary contributions to the field of clinical document classification. First, we introduce a novel framework that uniquely integrates high-accuracy expert labels with weak supervision for hyperparameter tuning, thus designing lightweight ML classifiers. 
This approach offers a more efficient and accurate solution for categorizing COPD-related documents as either &#x201C;relevant&#x201D; or &#x201C;nonrelevant,&#x201D; overcoming the limitations of traditional techniques. Second, we leveraged heuristic training data formulated by domain-expert clinicians to construct a training dataset without the need for extensive manual annotation. By using these expert hypotheses to generate &#x201C;silver-standard&#x201D; labels [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], we substantially mitigated the challenges commonly associated with manual annotation. Third, we conducted extensive experiments with diverse text representation techniques, rigorously evaluating the relative performance of different pipelines for identifying relevant documents while minimizing the inclusion of irrelevant ones.</p><p>Although our study focused on patients with COPD, the techniques we developed are designed to be adaptable across various clinical domains. Our approach of combining a small number of high-accuracy expert labels for hyperparameter tuning with a larger set of lower-accuracy labels for model training allows for potential improvements in model performance that may not be achievable using only lower-accuracy labels.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>We constructed a retrospective observational cohort of patients with COPD attending Vanderbilt University Medical Center (VUMC) between January 1, 2012, and December 31, 2020. A series of NLP pipelines of all clinical narrative notes from this cohort were used to perform data evaluation. The objective of this study was to create an AI or ML pipeline using weak supervision with a small set of annotated data that could effectively determine the relevance of the notes for future AI or ML tasks for a clinical condition of interest (COPD). 
Our cohort inclusion criteria were patients with a diagnosis of COPD at any point during the study period and who were aged 40 years or older at the time of their diagnosis. COPD was defined based on administrative codes available from the electronic health record (EHR) for the <italic>ICD, Ninth Revision, Clinical Modification</italic> (491, 491.1, 491.2, 491.21, 491.22, 491.8, 491.9, 492, 492.8, 493.92, and 496) or <italic>ICD, Tenth Revision, Clinical Modification</italic> (J41.0, J41.1, J41.8, J42, J43.1, J43.2, J43.8, J43.9, J44.0, J44.1, and J44.9).</p><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates our NLP framework for classifying clinical notes related to COPD, including heuristic data creation, text representation, model training, optimization, and evaluation. Our heuristic training data, or &#x201C;silver standard,&#x201D; consisted of 20,000 clinical notes. We created this dataset using expert-defined rules: notes from encounters with COPD diagnosis codes were labeled as informative (positive), whereas those from encounters without COPD diagnosis codes or from 24 months or more prior to initial COPD diagnosis were labeled as noninformative (negative). This heuristic approach enabled the creation of a large training dataset without manual annotation, leveraging domain expertise to generate plausible labels. We used various text representation techniques to transform the free text into numerical features, including bag of words (BoW), term frequency&#x2013;inverse document frequency (TF-IDF), lightweight document embedding, compression-based embeddings, and UMLS concept extraction. These representations fed into 3 ML classifiers: RF [<xref ref-type="bibr" rid="ref32">32</xref>], k-nearest neighbor (KNN) [<xref ref-type="bibr" rid="ref33">33</xref>], and extreme gradient boosting (XGBoost) [<xref ref-type="bibr" rid="ref34">34</xref>]. 
For model optimization and evaluation, we used a separate gold-standard dataset manually annotated by clinical experts. We used a subset of these data for hyperparameter optimization, allowing us to fine-tune our models based on high-quality, expert-labeled data, and a held-out test set for final model assessment. By leveraging a large heuristically labeled dataset for training and then optimizing and testing with a carefully curated gold standard, this approach integrates heuristic-based training data creation, diverse text representation techniques, and a 2-stage evaluation process using expert-annotated data. The following subsections will explain each block in more detail.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Framework for developing an effective lightweight classifier for clinical documents. BoW: bag of words; COPD: chronic obstructive pulmonary disease; KNN: k-nearest neighbor; RF: random forest; TF-IDF: term frequency&#x2013;inverse document frequency; UMLS: Unified Medical Language System; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e84326_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was approved by the VUMC Institutional Review Board (IRB #210139) and received a waiver of informed consent as retrospective secondary use of routinely collected data. Data were stored and analyzed on secure, encrypted on-premises servers with restricted access. All study team members with access to data were trained in human participants research ethics. No participant compensation was provided. 
The manuscript and supplementary materials present summary information only and contain no identifiable patient information.</p></sec><sec id="s2-3"><title>Data Collection and Annotation</title><sec id="s2-3-1"><title>Overview</title><p>This subsection details our data collection and labeling methods for identifying COPD-relevant clinical notes. We used a 2-tiered strategy to build our training and test datasets: heuristic-based labeling for large-scale training (silver standard) and manual expert annotation for precise model optimization and testing (gold standard). This method effectively addresses the challenge in medical NLP of creating large amounts of reliable training data while ensuring thorough model evaluation.</p><p>We compiled a corpus of 10 million clinical notes extracted from 32 million raw notes collected over a decade (study period). From this collection, we developed our training and test datasets.</p></sec><sec id="s2-3-2"><title>Heuristic Training Dataset (Silver Standard)</title><p>The training dataset consisted of positive and negative examples, identified using <italic>ICD</italic> code heuristics as follows:</p><list list-type="order"><list-item><p>The COPD encounter corpus (positive labels) consisted of 10,000 notes from encounters <italic>ICD</italic>-coded as COPD, likely containing COPD-relevant information.</p></list-item><list-item><p>The non-COPD encounter corpus (negative labels) consisted of 10,000 notes from patient encounters without any COPD diagnosis code, indicating content likely irrelevant to COPD.</p></list-item><list-item><p>The pre-COPD encounter corpus (negative labels) consisted of 10,000 notes from patient encounters occurring 24 months or more prior to the initial COPD diagnosis; this approach aimed to avoid including notes with early signs of COPD.</p></list-item></list><p>We constructed 3 training subsets to evaluate different heuristic combinations and their impact on classifier accuracy. 
Each subset contained 5000 positive examples from (1) and 5000 negative examples. The negative examples varied across subsets: the first used a mix of 2500 each from (2) and (3), the second used 5000 from (2) only, and the third used 5000 from (3) only.</p><p>This design allowed us to assess which combination produced the most accurate classifier. We ensured diversity in patient demographics, timing, and note types across all samples.</p></sec><sec id="s2-3-3"><title>Test Data (Gold Standard)</title><p>Clinical subject matter experts developed an annotation guideline to determine the relevance of documents for COPD diagnosis and exacerbation prediction. The guideline was iteratively refined through consultations with clinical experts to ensure consistent labeling criteria. Using this guideline, we created a reference standard for evaluation by randomly selecting 307 documents from the positive and negative training datasets, including 107 COPD-coded notes, 100 non-COPD encounter notes, and 100 pre-COPD encounter notes. Three clinicians independently reviewed and annotated these documents, labeling them as either relevant or irrelevant for COPD exacerbation prediction. Disagreements were resolved through adjudication among the reviewers to produce the final consensus labels. The annotated documents were then divided into a validation set used for model optimization during development and a held-out test set used for final evaluation. Specifically, 84 documents were used for validation, whereas 223 documents were reserved as the independent test set. The final test set size reflected the adjudicated label set after reconciliation of annotations among the 3 reviewing clinicians.</p></sec></sec><sec id="s2-4"><title>NLP Model Development</title><sec id="s2-4-1"><title>Overview</title><p>The development of the NLP pipeline for classifying clinical documents related to COPD diagnosis involved several key stages. 
Initially, various NLP techniques were applied to transform clinical text into informative numerical representations. These numerical representations served as input for ML models, which were subsequently optimized to enhance performance. Finally, the fully optimized models were assessed against a held-out test set to measure their effectiveness. The following subsections will explain in detail the different text representation methods and ML approaches and the processes of training, optimization, and testing the models.</p></sec><sec id="s2-4-2"><title>Text Representation</title><p>This step involved exploring various text representation methods, including BoW, TF-IDF, lightweight document embedding, and lossless compression representation.</p><p>BoW is a text representation technique that treats each document as a collection of words, disregarding their order. It creates a vector where each element corresponds to the frequency of a word in the document, ignoring context [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>TF-IDF is a text analysis method that evaluates the importance of words in a document within a larger corpus. It calculates a score for each word based on its frequency in the document and its rarity across all documents, helping identify key terms [<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>Lightweight document embedding [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>] is a modern text representation approach that uses pretrained language models such as Word2Vec or FastText to convert words or phrases into dense, continuous-valued vectors. These embeddings capture semantic relationships between words and are useful for various NLP tasks. 
Using distributed memory and distributed BoW approaches for enhancing paragraph and document embeddings [<xref ref-type="bibr" rid="ref38">38</xref>], we incorporated the hierarchical softmax algorithm [<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>Compression representation uses lossless compression techniques. The key idea is to leverage compressors to efficiently represent text information as they excel at capturing regularity. Normalized compression distance [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>] quantifies shared information between text objects by comparing their compressed lengths, making it suitable for classification. Using gzip as the compressor, this lightweight and universal method calculates normalized compression distances between testing and training data for KNN classification, offering a simple and resource-efficient alternative to deep neural networks for text classification tasks. This lightweight methodology is readily applicable to extensive text and has demonstrated better performance compared to the bidirectional encoder representations from transformers model [<xref ref-type="bibr" rid="ref41">41</xref>]. While this approach has been tested on general texts such as news articles and Yahoo Answers, we extended its use by applying it to clinical documents. However, ML classifiers such as RF and XGBoost do not natively use compression distance metrics as features. Thus, we compressed the raw text using gzip. The length of the compressed data then served as the key predictive feature for RF and XGBoost models.</p><p>The UMLS, on which this text representation is based, is a comprehensive resource developed by the National Library of Medicine to facilitate the integration of biomedical information systems [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. 
One of its key components is the representation of biomedical concepts and their relationships through various text formats. In UMLS, each concept is assigned a unique identifier called a Concept Unique Identifier (CUI). CUIs are alphanumeric codes that remain stable across different versions of the UMLS, serving as a means to identify and link concepts across multiple vocabularies and ontologies. For example, the CUI C0018787 represents the concept of &#x201C;heart&#x201D; in the UMLS. In this representation, text documents were encoded as vectors where each element corresponded to a unique CUI. The presence of a CUI was indicated by a nonzero value, which also encoded contextual information such as assertion status (eg, positive or negative). This approach combines elements of one-hot encoding with additional semantic information, allowing for a more nuanced representation of the document&#x2019;s content.</p></sec><sec id="s2-4-3"><title>ML Training, Optimization, and Evaluation Metrics</title><p>This study used a rigorous methodology to train, optimize, and compare 3 ML algorithms: RF, XGBoost, and KNN. We used a weakly supervised learning approach, training models on silver-labeled documents to learn discriminative features for identifying clinically relevant notes without extensive manual annotation. While not perfect, this silver-labeled dataset provided a valuable starting point for model training.</p><p>The optimization process started with hyperparameter tuning through grid search, systematically examining key parameters for each model. For RF, we considered the number of trees, maximum depth, and minimum sample split. XGBoost optimization focused on learning rate, maximum depth, subsample, and number of boosting rounds. For KNN, we tuned the number of neighbors and distance metric. 
Five-fold cross-validation assessed each hyperparameter combination, with the <italic>F</italic><sub>1</sub>-score serving as the primary metric due to its capacity to balance precision and recall.</p><p>To refine our approach, we leveraged a small set of clinician-annotated evaluation data. This allowed for better fine-tuning of the models to distinguish document relevance, balancing the primary aim of maximizing recall (to ensure identification of as many truly relevant documents as possible) with maintaining precision (to avoid excessive misclassification of irrelevant documents). We proceeded to fine-tune the models by adjusting classification thresholds. Probability predictions were generated on a validation dataset, with thresholds ranging from 0.0 to 1.0 in 0.1 increments. Precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated at each threshold, facilitating the identification of an optimal threshold that maximized the <italic>F</italic><sub>1</sub>-score while maintaining an appropriate balance between precision and recall. The final evaluation was conducted on a held-out test set, focusing primarily on key metrics: recall (sensitivity), precision (positive predictive value), specificity, and <italic>F</italic><sub>1</sub>-score for the relevant class. Recall measures the fraction of total relevant documents correctly classified, whereas precision assesses the accuracy in identifying only relevant documents, minimizing false positives [<xref ref-type="bibr" rid="ref44">44</xref>]. Specificity evaluates the system&#x2019;s ability to correctly identify nonrelevant documents, providing a comprehensive picture of classification performance. The <italic>F</italic><sub>1</sub>-score, as the harmonic mean of precision and recall, offers a combined measure of classification accuracy [<xref ref-type="bibr" rid="ref44">44</xref>]. 
Detailed performance metrics at various classification thresholds for the best-performing model are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, offering deeper insights into model behavior across diverse scenarios. The 95% CIs for <italic>F</italic><sub>1</sub>-scores were computed using bootstrap resampling (1000 iterations) on the held-out test set, providing robust uncertainty estimates for all reported performance metrics.</p></sec></sec><sec id="s2-5"><title>Experiment Design</title><sec id="s2-5-1"><title>Baseline</title><p>The baseline classification for COPD-related documents used expert-defined heuristics based on <italic>ICD</italic> codes and the temporal relevance of notes. These heuristics were evaluated against a manually annotated gold-standard test dataset, which included examples from each category (COPD coded, non-COPD coded, and pre-COPD temporal). This evaluation served 2 purposes: first, to assess the effectiveness of these simple rules in identifying COPD-relevant notes compared to clinician annotations and, second, to establish a baseline for comparison with ML approaches. The subsequent ML models were trained on the heuristically labeled data, optimized using a small annotated dataset, and then tested on the same gold-standard test data. This process determined whether ML could optimize the classification process while requiring only a small annotated dataset for fine-tuning.</p></sec><sec id="s2-5-2"><title>Experiment Setup</title><p>We experimented with 3 classifiers&#x2014;RF, XGBoost, and KNN&#x2014;using BoW representations as these models are known to effectively handle various data types [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. 
Following the same standardized procedure, we also experimented with the ML models using other text representations: TF-IDF, lightweight document embeddings, and compression-based features.</p></sec><sec id="s2-5-3"><title>Hyperparameter Tuning</title><p>Hyperparameter tuning was conducted for the RF, XGBoost, and KNN classifiers to optimize their performance on the text classification task. For RF, we explored the number of trees (100-500), maximum depth (10-30 and none), minimum samples for split and leaf nodes (2-10 and 1-4, respectively), feature selection strategies for node splitting, and class weight options. XGBoost tuning focused on learning rate (0.01-0.3), number of estimators (100-1000), maximum depth (3-10), minimum child weight (1-6), subsampling ratio (0.5-1.0), and column sampling by tree (0.5-1.0). For KNN, we varied the number of neighbors (1-20), weighting function (uniform and distance), distance metrics (Euclidean, Manhattan, and Minkowski), and leaf size (10-50). Grid search with 5-fold cross-validation was used for all models using multiple scoring metrics (<italic>F</italic><sub>1</sub>-score, precision, recall, and specificity) to evaluate performance. This approach allowed for the selection of optimal hyperparameter combinations that balanced model complexity, computational efficiency, and generalization performance tailored to the specific challenges of text classification.</p></sec><sec id="s2-5-4"><title>Threshold Analysis</title><p>In our case, optimizing recall was crucial, particularly because the cost of false negatives was high. While classification models often default to a 0.5 threshold for converting probability predictions into class labels, it was possible that this threshold would not be ideal for our needs. Our threshold analysis evaluated model performance across a range of thresholds, from 0.0 to 1.0 in increments of 0.1. 
At each threshold, we calculated precision, recall, specificity, and the <italic>F</italic><sub>1</sub>-score. This analysis allowed us to fine-tune the model&#x2019;s decision boundary to maximize recall while maintaining excellent precision, ensuring that we identified as many relevant cases as possible without significantly increasing the number of false positives.</p><p>For all experiments, models were optimized on a separate validation set to find the optimal hyperparameters that maximized recall as accurately identifying all relevant documents was crucial. Instead of relying solely on the default threshold, we iteratively adjusted the recall threshold to find the optimal cutoff for each model. The optimized parameters based on this evaluation were then used in the final models and applied to the held-out test set. By adhering to this consistent methodology, we were able to fairly assess the performance of different classifiers and text representations.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Baseline Performance</title><p>The defined <italic>ICD</italic>-based heuristics were evaluated against manually annotated gold-standard test data. <xref ref-type="table" rid="table1">Table 1</xref> shows the baseline model&#x2019;s performance in classifying COPD-related documents. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the baseline confusion matrix on the test data. 
The heuristics correctly identified 50 irrelevant and 57 relevant documents while misclassifying 24 irrelevant and 92 relevant documents.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The baseline model&#x2019;s performance in classifying chronic obstructive pulmonary disease&#x2013;related documents.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Precision (PPV<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>)</td><td align="left" valign="bottom">Recall (sensitivity)</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Baseline model</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.38</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.50 (0.43-0.56)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>PPV: positive predictive value.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Baseline confusion matrix on the test data.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e84326_fig02.png"/></fig></sec><sec id="s3-2"><title>ML Framework Performance</title><p>The results validate the effectiveness of our framework, which used various text representation techniques and ML models to eliminate irrelevant documents and prioritize information in relevant ones. We found that combining both negative cases (non-COPD coded and pre-COPD temporal) consistently produced the best model across all representation techniques. 
As shown in Tables S1 to S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, the mixed-negative configuration yielded the highest <italic>F</italic><sub>1</sub>-score across most text representations, supporting this choice as the primary training strategy. Tables S1 to S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provide comprehensive results for all combinations, but here, we focus on the results obtained by training the model on the combined dataset using both negative heuristics.</p><p><xref ref-type="table" rid="table2">Table 2</xref> presents a comparative analysis of various ML models (RF, XGBoost, and KNN) across different document representations. The RF model consistently demonstrated superior performance with balanced precision, recall, and <italic>F</italic><sub>1</sub>-score for the relevant class when using BoW representation. In the TF-IDF representation, KNN showed high precision but lower recall, whereas RF provided a more balanced performance. With lightweight document embeddings, RF achieved high recall but lower precision for the irrelevant class, indicating a trade-off. For compression-based representations, RF and XGBoost performed comparably, with KNN achieving a higher <italic>F</italic><sub>1</sub>-score. 
Finally, in the UMLS representation, XGBoost marginally outperformed RF in precision and <italic>F</italic><sub>1</sub>-score, whereas KNN showed high precision but lower recall.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance comparison of classification models on the chronic obstructive pulmonary disease dataset using various representations.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Representation and model</td><td align="left" valign="bottom">Precision (PPV<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>)</td><td align="left" valign="bottom">Recall (sensitivity)</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">BoW<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.74 (0.7-0.78)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.71 (0.65-0.76)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KNN<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.62</td><td 
align="left" valign="top">0.57 (0.53-0.62)</td></tr><tr><td align="left" valign="top" colspan="5">TF-IDF<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.73 (0.68-0.78)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.64</td><td align="left" valign="top">0.46</td><td align="left" valign="top">0.67 (0.61-0.71)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KNN</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.51 (0.46-0.55)</td></tr><tr><td align="left" valign="top" colspan="5">Lightweight document embedding</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.36</td><td align="left" valign="top">0.80 (0.76-0.87)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.74 (0.71-0.77)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KNN</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.32</td><td align="left" 
valign="top">0.92</td><td align="left" valign="top">0.47 (0.43-0.52)</td></tr><tr><td align="left" valign="top" colspan="5">Compression based</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.57 (0.52-0.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.57 (0.54-0.62)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KNN</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.64</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.69 (0.63-0.74)</td></tr><tr><td align="left" valign="top" colspan="5">UMLS<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.64</td><td align="left" valign="top">0.63 (0.58-0.69)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.46</td><td align="left" valign="top">0.65 (0.6-0.68)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KNN</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.30</td><td align="left" 
valign="top">0.86</td><td align="left" valign="top">0.44 (0.38-0.51)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PPV: positive predictive value.</p></fn><fn id="table2fn2"><p><sup>b</sup>BoW: bag of words.</p></fn><fn id="table2fn3"><p><sup>c</sup>RF: random forest.</p></fn><fn id="table2fn4"><p><sup>d</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table2fn5"><p><sup>e</sup>KNN: k-nearest neighbor.</p></fn><fn id="table2fn6"><p><sup>f</sup>TF-IDF: term frequency&#x2013;inverse document frequency.</p></fn><fn id="table2fn7"><p><sup>g</sup>UMLS: Unified Medical Language System.</p></fn></table-wrap-foot></table-wrap><p>Lightweight document embedding combined with the RF model was the most effective method for extracting COPD-relevant documents. This approach achieved an <italic>F</italic><sub>1</sub>-score of 0.80 (95% CI 0.76-0.87) for the relevant class, outperforming other representation techniques examined in this study.</p></sec><sec id="s3-3"><title>Best-Performing Model Comparison and Threshold Analysis</title><p>The baseline model confusion matrix (<xref ref-type="fig" rid="figure2">Figure 2</xref>) correctly identified 50 irrelevant and 57 relevant documents, with 24 irrelevant and 92 relevant documents misclassified. The best-performing NLP pipeline confusion matrix (<xref ref-type="fig" rid="figure3">Figure 3</xref>) correctly identified 27 irrelevant and 128 relevant documents, with 47 irrelevant and 21 relevant documents misclassified. 
The best-performing model significantly improved relevant document identification (128 vs 57) and reduced relevant document misclassification (21 vs 92).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Best-performing natural language processing pipeline confusion matrix.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e84326_fig03.png"/></fig><p>Although the best-performing NLP pipeline had a higher irrelevant document misclassification rate (as highlighted by low specificity), this trade-off was acceptable given the optimization for sensitivity, which was the primary goal. The detailed results for the best-performing classifier at various cutoff points are shown in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The figure illustrates the relationship among precision, sensitivity, and specificity across different thresholds for the classification model.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study demonstrated the effectiveness of lightweight ML models, particularly an RF classifier with lightweight document embeddings, in extracting COPD-relevant documents from EHRs. This approach significantly outperformed baseline heuristic methods, achieving high recall (0.86) and <italic>F</italic><sub>1</sub>-score (0.80, 95% CI 0.76&#x2010;0.87) for relevant documents. These findings represent a significant advancement in efficient EHR processing, offering a scalable solution for filtering large volumes of clinical documents in resource-constrained environments. 
By improving the identification of relevant information, this approach has the potential to enhance the performance of clinical decision support systems and optimize resource use in health care applications.</p></sec><sec id="s4-2"><title>ML Optimization Over Heuristic Approaches</title><p>Traditional heuristic approaches, such as relying on <italic>ICD</italic> codes for document filtering, have proven inadequate in accurately capturing relevant clinical information. Our findings confirm that a substantial proportion of documents with COPD <italic>ICD</italic> codes lack significant relevant information (as illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>), highlighting the limitations of such methods. In contrast, our ML framework effectively distinguished relevant from irrelevant documents (as illustrated in <xref ref-type="fig" rid="figure3">Figure 3</xref>), thereby improving the sensitivity and identification of relevant documents. ML models outperformed baseline approaches by making optimal use of small annotation sets, learning deeper patterns, adapting flexibly, generalizing better from imperfect examples, handling noisy data robustly, and using advanced feature extraction techniques [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref47">47</xref>].</p></sec><sec id="s4-3"><title>Improvement and Effectiveness of the Model</title><p>A primary contribution of our work is demonstrating how lightweight models such as an RF classifier paired with lightweight document embeddings can achieve high performance with minimal annotation data. By using weak supervision, we generated &#x201C;silver-standard&#x201D; labels from domain-expert heuristics, significantly reducing the need for extensive manual annotation. 
This method not only streamlined the training process but also enhanced the model&#x2019;s scalability across different clinical domains.</p><p>Our methodology uniquely integrated a large corpus of weakly supervised silver-standard labels for initial training with a small set of high-fidelity gold-standard labels for hyperparameter optimization. This approach not only streamlined the training process but also enhanced model performance and generalizability across diverse clinical domains.</p><p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, among the evaluated text representation techniques, lightweight document embedding coupled with the RF classifier exhibited superior performance in classifying relevant COPD documents, achieving a recall of 0.86 and an <italic>F</italic><sub>1</sub>-score of 0.80 (95% CI 0.76-0.87) for the relevant class. This performance significantly surpassed the baseline model&#x2019;s <italic>F</italic><sub>1</sub>-score of 0.50 (95% CI 0.43-0.56), underscoring the efficacy of our approach compared to conventional heuristic methods.</p><p>The incorporation of a limited set of high-fidelity labels for hyperparameter tuning resulted in a substantial improvement in model performance. The model-agnostic nature of this approach rendered it applicable to any classifier requiring hyperparameter optimization.</p><p>Additionally, the results underscore the critical role of text representation in model performance. While both RF and XGBoost proved to be robust classifiers, their effectiveness was notably influenced by the choice of text representation technique. 
For instance, RF&#x2019;s <italic>F</italic><sub>1</sub>-score for the relevant class ranged from 0.57 (95% CI 0.52-0.60) with compression-based representation to 0.80 (95% CI 0.76-0.87) with lightweight document embedding.</p></sec><sec id="s4-4"><title>Limitations and Future Directions</title><p>While our study demonstrated the effectiveness of lightweight document embedding and ML models in extracting relevant COPD documents, several limitations should be considered. First, the dataset used in this study was specific to COPD, and the generalizability of our findings to other clinical conditions remains to be evaluated. Future research should examine the applicability of this framework across a broader range of diseases and clinical document types. Second, the data were derived from a single institution (VUMC), which may limit external validity due to variations in clinical documentation practices across health care systems. Multi-institutional validation using datasets from different health systems would help assess the robustness and portability of the proposed framework. Finally, while this study focused on binary classification of documents as relevant or irrelevant, future work could explore more granular categorization of clinical documents through multi-class classification (eg, exacerbation-related notes, diagnostic documentation, and treatment updates). Such extensions could further enhance the utility of this framework for downstream clinical decision support and LLM-based information extraction pipelines.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study presents a novel framework for extracting COPD-relevant clinical documents using lightweight document embedding and ML models. Our approach effectively identifies relevant documents while minimizing the identification of irrelevant ones, enhancing the quality of information for clinical decision support systems and improving patient outcomes. 
Future research should explore the generalizability of our findings to other chronic conditions and integrate this model into predictive analytics for greater efficiency and effectiveness. This framework can be adopted across diverse health care domains, contributing to improved patient care and clinical outcomes.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was funded by the National Heart, Lung, and Blood Institute (R01:HL157130). Data access through Vanderbilt University&#x2019;s Research Derivative was supported by Clinical and Translational Science Award UL1TR000445 from the National Center for Advancing Translational Sciences. The National Heart, Lung, and Blood Institute; National Center for Advancing Translational Sciences; and National Institutes of Health had no role in the design and conduct of the study; collection, management, analysis, and interpretation of the data; preparation, review, and approval of the manuscript; or decision to submit the manuscript for publication.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request and approval from Vanderbilt University Medical Center.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MA-G, RMR</p><p>Data curation: DW, MA-G</p><p>Formal analysis: MA-G (lead), RMR (supporting)</p><p>Funding acquisition: SED, JRB, MEM</p><p>Investigation: MA-G</p><p>Methodology: MA-G, RMR, TAL (supporting)</p><p>Project administration: IMR (lead), SED (supporting)</p><p>Software: MA-G, DW</p><p>Supervision: MEM, JRB, SED</p><p>Validation: AKC, BWR, TAL, MEM</p><p>Visualization: MA-G</p><p>Writing&#x2014;original draft: MA-G (lead), RMR (supporting)</p><p>Writing&#x2014;review and editing: SED, MEM, DW, AKC, BWR, TAL, IMR, LMP, JRB</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term 
id="abb1">AI </term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BoW</term><def><p>bag of words</p></def></def-item><def-item><term id="abb3">COPD</term><def><p>chronic obstructive pulmonary disease</p></def></def-item><def-item><term id="abb4">CUI</term><def><p>Concept Unique Identifier</p></def></def-item><def-item><term id="abb5">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb6">ICD</term><def><p>International Classification of Diseases</p></def></def-item><def-item><term id="abb7">KNN</term><def><p>k-nearest neighbor</p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb10">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb11">RF</term><def><p>random forest</p></def></def-item><def-item><term id="abb12">TF-IDF</term><def><p>term frequency&#x2013;inverse document frequency</p></def></def-item><def-item><term id="abb13">UMLS</term><def><p>Unified Medical Language System</p></def></def-item><def-item><term id="abb14">VUMC</term><def><p>Vanderbilt University Medical Center</p></def></def-item><def-item><term id="abb15">XGBoost</term><def><p>extreme gradient boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="report"><article-title>Report of the Director: National Institutes of Health; Fiscal Years 2012 &#x0026; 2013</article-title><access-date>2026-04-20</access-date><publisher-name>National Institutes of Health</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://dpcpsi.nih.gov/sites/g/files/mnhszr346/files/NIH_OD_Biennial_report_2012-2013_508complete.pdf">https://dpcpsi.nih.gov/sites/g/files/mnhszr346/files/NIH_OD_Biennial_report_2012-2013_508complete.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bollmeier</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Hartmann</surname><given-names>AP</given-names> </name></person-group><article-title>Management of chronic obstructive pulmonary disease: a review focusing on exacerbations</article-title><source>Am J Health Syst Pharm</source><year>2020</year><month>02</month><day>7</day><volume>77</volume><issue>4</issue><fpage>259</fpage><lpage>268</lpage><pub-id pub-id-type="doi">10.1093/ajhp/zxz306</pub-id><pub-id pub-id-type="medline">31930287</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Chronic obstructive pulmonary disease</article-title><source>Centers for Disease Control and Prevention</source><access-date>2020-05-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/copd/index.html">https://www.cdc.gov/copd/index.html</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clarke</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Bourn</surname><given-names>S</given-names> </name><name name-style="western"><surname>Skoufalos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Beck</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Castillo</surname><given-names>DJ</given-names> </name></person-group><article-title>An innovative approach to health care delivery for 
patients with chronic conditions</article-title><source>Popul Health Manag</source><year>2017</year><month>02</month><volume>20</volume><issue>1</issue><fpage>23</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1089/pop.2016.0076</pub-id><pub-id pub-id-type="medline">27563751</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Veroff</surname><given-names>D</given-names> </name><name name-style="western"><surname>Marr</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wennberg</surname><given-names>DE</given-names> </name></person-group><article-title>Enhanced support for shared decision making reduced costs of care for patients with preference-sensitive conditions</article-title><source>Health Aff (Millwood)</source><year>2013</year><month>02</month><volume>32</volume><issue>2</issue><fpage>285</fpage><lpage>293</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.2011.0941</pub-id><pub-id pub-id-type="medline">23381521</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Suominen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gedeon</surname><given-names>T</given-names> </name></person-group><article-title>Adapting state-of-the-art deep language models to clinical information extraction systems: potentials, challenges, and solutions</article-title><source>JMIR Med Inform</source><year>2019</year><month>04</month><day>25</day><volume>7</volume><issue>2</issue><fpage>e11499</fpage><pub-id pub-id-type="doi">10.2196/11499</pub-id><pub-id pub-id-type="medline">31021325</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wehbe</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>FS</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>A comparative study of pretrained language models for long clinical text</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>01</month><day>18</day><volume>30</volume><issue>2</issue><fpage>340</fpage><lpage>347</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac225</pub-id><pub-id pub-id-type="medline">36451266</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dagdelen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Structured information extraction from scientific text with large language models</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>15</day><volume>15</volume><issue>1</issue><fpage>1418</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45563-x</pub-id><pub-id pub-id-type="medline">38360817</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> 
</name><etal/></person-group><article-title>A study of generative large language model for medical research and healthcare</article-title><source>NPJ Digit Med</source><year>2023</year><month>11</month><day>16</day><volume>6</volume><issue>1</issue><fpage>210</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00958-w</pub-id><pub-id pub-id-type="medline">37973919</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>N&#x00E9;v&#x00E9;ol</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Velupillai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zweigenbaum</surname><given-names>P</given-names> </name></person-group><article-title>Clinical natural language processing in languages other than English: opportunities and challenges</article-title><source>J Biomed Semantics</source><year>2018</year><month>03</month><day>30</day><volume>9</volume><issue>1</issue><fpage>12</fpage><pub-id 
pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id><pub-id pub-id-type="medline">29602312</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Clinical concept extraction: a methodology review</article-title><source>J Biomed Inform</source><year>2020</year><month>09</month><volume>109</volume><fpage>103526</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103526</pub-id><pub-id pub-id-type="medline">32768446</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mujtaba</surname><given-names>G</given-names> </name><name name-style="western"><surname>Shuib</surname><given-names>L</given-names> </name><name name-style="western"><surname>Raj</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Rajandram</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shaikh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> </name></person-group><article-title>Classification of forensic autopsy reports through conceptual graph-based document representation model</article-title><source>J Biomed Inform</source><year>2018</year><month>06</month><volume>82</volume><fpage>88</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2018.04.013</pub-id><pub-id pub-id-type="medline">29738820</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>de Bruijn</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cranney</surname><given-names>A</given-names> </name><name name-style="western"><surname>O&#x2019;Donnell</surname><given-names>S</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Forster</surname><given-names>AJ</given-names> </name></person-group><article-title>Identifying wrist fracture patients with high accuracy by automatic categorization of X-ray reports</article-title><source>J Am Med Inform Assoc</source><year>2006</year><volume>13</volume><issue>6</issue><fpage>696</fpage><lpage>698</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1995</pub-id><pub-id pub-id-type="medline">16929046</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mujtaba</surname><given-names>G</given-names> </name><name name-style="western"><surname>Shuib</surname><given-names>L</given-names> </name><name name-style="western"><surname>Raj</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Rajandram</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shaikh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> </name></person-group><article-title>Automatic ICD-10 multi-class classification of cause of death from plaintext autopsy reports through expert-driven feature selection</article-title><source>PLoS One</source><year>2017</year><volume>12</volume><issue>2</issue><fpage>e0170242</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0170242</pub-id><pub-id pub-id-type="medline">28166263</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Fries</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Steinberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Khattar</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Ontology-driven weak supervision for clinical entity classification in electronic health records</article-title><source>Nat Commun</source><year>2021</year><month>04</month><day>1</day><volume>12</volume><issue>1</issue><fpage>2017</fpage><pub-id pub-id-type="doi">10.1038/s41467-021-22328-4</pub-id><pub-id pub-id-type="medline">33795682</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Suarez-Paniagua</surname><given-names>V</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Whitfield</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name></person-group><article-title>Rare disease identification from clinical notes with ontologies and weak supervision</article-title><source>Annu Int Conf IEEE Eng Med Biol Soc</source><year>2021</year><month>11</month><volume>2021</volume><fpage>2294</fpage><lpage>2298</lpage><pub-id pub-id-type="doi">10.1109/EMBC46164.2021.9630043</pub-id><pub-id pub-id-type="medline">34891745</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cusick</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Adekkanattu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Campion</surname><given-names>TR</given-names> <suffix>Jr</suffix></name><etal/></person-group><article-title>Using weak supervision and deep learning to classify clinical notes for identification of current suicidal ideation</article-title><source>J Psychiatr Res</source><year>2021</year><month>04</month><volume>136</volume><fpage>95</fpage><lpage>102</lpage><pub-id pub-id-type="doi">10.1016/j.jpsychires.2021.01.052</pub-id><pub-id pub-id-type="medline">33581461</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sohn</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A clinical text classification paradigm using weak supervision and deep representation</article-title><source>BMC Med Inform Decis Mak</source><year>2019</year><month>01</month><day>7</day><volume>19</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1186/s12911-018-0723-6</pub-id><pub-id pub-id-type="medline">30616584</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>DD</given-names> </name></person-group><article-title>Machine intelligence in cardiovascular medicine</article-title><source>Cardiol Rev</source><year>2020</year><volume>28</volume><issue>2</issue><fpage>53</fpage><lpage>64</lpage><pub-id pub-id-type="doi">10.1097/CRD.0000000000000294</pub-id><pub-id pub-id-type="medline">32022759</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Naumann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>Natural language processing for EHR-based computational phenotyping</article-title><source>IEEE/ACM Trans Comput Biol Bioinform</source><year>2019</year><volume>16</volume><issue>1</issue><fpage>139</fpage><lpage>153</lpage><pub-id pub-id-type="doi">10.1109/TCBB.2018.2849968</pub-id><pub-id pub-id-type="medline">29994486</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Acharya</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shrestha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Clinical risk prediction using language models: benefits and considerations</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1856</fpage><lpage>1864</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae030</pub-id><pub-id pub-id-type="medline">38412328</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayebi Arasteh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Lotfinia</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models streamline automated machine learning for clinical studies</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>21</day><volume>15</volume><issue>1</issue><fpage>1603</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45879-8</pub-id><pub-id pub-id-type="medline">38383555</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Al-Garadi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mungle</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sarker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Matheny</surname><given-names>ME</given-names> </name></person-group><article-title>Large language models in healthcare</article-title><source>arXiv</source><comment>Preprint posted 
online on Feb 6, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.04748</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ford</surname><given-names>E</given-names> </name><name name-style="western"><surname>Carroll</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>HE</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cassell</surname><given-names>JA</given-names> </name></person-group><article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2016</year><month>09</month><volume>23</volume><issue>5</issue><fpage>1007</fpage><lpage>1015</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id><pub-id pub-id-type="medline">26911811</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Book</surname><given-names>WM</given-names> </name><etal/></person-group><article-title>Supervised text classification system detects Fontan patients in electronic records with higher accuracy than ICD codes</article-title><source>J Am Heart Assoc</source><year>2023</year><month>07</month><day>4</day><volume>12</volume><issue>13</issue><fpage>e030046</fpage><pub-id pub-id-type="doi">10.1161/JAHA.123.030046</pub-id><pub-id pub-id-type="medline">37345821</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Teng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>LibN3L: a lightweight package for neural NLP</article-title><access-date>2026-04-21</access-date><conf-name>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC&#x2019;16)</conf-name><conf-date>May 23-28, 2016</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/L16-1034/">https://aclanthology.org/L16-1034/</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>van Krimpen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Spruit</surname><given-names>M</given-names> </name></person-group><article-title>A lightweight API-based approach for building flexible clinical NLP systems</article-title><source>J Healthc Eng</source><year>2019</year><volume>2019</volume><fpage>3435609</fpage><pub-id pub-id-type="doi">10.1155/2019/3435609</pub-id><pub-id pub-id-type="medline">31511785</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>Z</given-names> </name></person-group><article-title>On the use of silver standard data for zero-shot 
classification tasks in information extraction</article-title><source>arXiv</source><comment>Preprint posted online on Feb 28, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.18061</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name></person-group><article-title>Two directions for clinical data generation with large language models: data-to-label and label-to-data</article-title><source>Proc Conf Empir Methods Nat Lang Process</source><year>2023</year><month>12</month><volume>2023</volume><fpage>7129</fpage><lpage>7143</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.474</pub-id><pub-id pub-id-type="medline">38213944</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>K</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sanghvi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>M</given-names> </name></person-group><article-title>A comparative analysis of logistic regression, random forest and KNN models for the text classification</article-title><source>Augment Hum Res</source><year>2020</year><volume>5</volume><issue>1</issue><fpage>12</fpage><pub-id pub-id-type="doi">10.1007/s41133-020-00032-0</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Peterson</surname><given-names>LE</given-names> </name></person-group><article-title>K-nearest neighbor</article-title><source>Scholarpedia</source><year>2009</year><volume>4</volume><issue>2</issue><fpage>1883</fpage><pub-id pub-id-type="doi">10.4249/scholarpedia.1883</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Scalable and flexible gradient boosting</article-title><source>DMLC XGBoost</source><access-date>2026-05-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://xgboost.ai/">https://xgboost.ai/</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>ZH</given-names> </name></person-group><article-title>Understanding bag-of-words model: a statistical framework</article-title><source>Int J Mach Learn Cybern</source><year>2010</year><month>12</month><volume>1</volume><issue>1</issue><fpage>43</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1007/s13042-010-0001-0</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Ramos</surname><given-names>J</given-names> </name></person-group><article-title>Using TF-IDF to determine word relevance in document queries</article-title><source>Rutgers University</source><year>2003</year><access-date>2026-04-29</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://web.archive.org/web/20220401060524/http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.1424&#x0026;rep=rep1&#x0026;type=pdf">https://web.archive.org/web/20220401060524/http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.1424&#x0026;rep=rep1&#x0026;type=pdf</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Burges</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Bottou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Welling</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ghahramani</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name></person-group><article-title>Distributed representations of words and phrases and their compositionality</article-title><source>Advances in Neural Information Processing Systems</source><year>2013</year><publisher-name>Curran Associates</publisher-name><fpage>3111</fpage><lpage>3119</lpage></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> 
</name></person-group><article-title>Distributed representations of sentences and documents</article-title><source>Proc Mach Learn Res</source><year>2014</year><access-date>2026-04-29</access-date><volume>32</volume><issue>2</issue><fpage>1188</fpage><lpage>1196</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v32/le14.pdf">https://proceedings.mlr.press/v32/le14.pdf</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><article-title>Efficient estimation of word representations in vector space</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 16, 2013</comment><pub-id pub-id-type="doi">10.48550/arXiv.1301.3781</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Vit&#x00E1;nyi</surname><given-names>PMB</given-names> </name></person-group><article-title>Normalized compression distance of multisets with applications</article-title><source>IEEE Trans Pattern Anal Mach Intell</source><year>2015</year><month>08</month><volume>37</volume><issue>8</issue><fpage>1602</fpage><lpage>1614</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2014.2375175</pub-id><pub-id pub-id-type="medline">26352998</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tsirlin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name></person-group><article-title>&#x201C;Low-resource&#x201D; text classification: a parameter-free classification method with compressors</article-title><access-date>2026-04-21</access-date><conf-name>Findings of the Association for Computational Linguistics: ACL 2023</conf-name><conf-date>Jul 9-14, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://scholar.google.com/citations?view_op=view_citation&#x0026;hl=en&#x0026;user=eJ5MnJ8AAAAJ&#x0026;citation_for_view=eJ5MnJ8AAAAJ:LkGwnXOMwfcC">https://scholar.google.com/citations?view_op=view_citation&#x0026;hl=en&#x0026;user=eJ5MnJ8AAAAJ&#x0026;citation_for_view=eJ5MnJ8AAAAJ:LkGwnXOMwfcC</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Unified Medical Language System term occurrences in clinical notes: a large-scale corpus analysis</article-title><source>J Am Med Inform Assoc</source><year>2012</year><month>06</month><volume>19</volume><issue>e1</issue><fpage>e149</fpage><lpage>e156</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000744</pub-id><pub-id 
pub-id-type="medline">22493050</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Humphreys</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Lindberg</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Schoolman</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Barnett</surname><given-names>GO</given-names> </name></person-group><article-title>The Unified Medical Language System: an informatics research collaboration</article-title><source>J Am Med Inform Assoc</source><year>1998</year><volume>5</volume><issue>1</issue><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1136/jamia.1998.0050001</pub-id><pub-id pub-id-type="medline">9452981</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Resnik</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name></person-group><article-title>Evaluation of NLP systems</article-title><source>The Handbook of Computational Linguistics and Natural Language Processing</source><year>2010</year><publisher-name>John Wiley &#x0026; Sons</publisher-name><fpage>271</fpage><lpage>295</lpage></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ross Quinlan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Top 10 algorithms in data mining</article-title><source>Knowl Inf 
Syst</source><year>2008</year><month>01</month><volume>14</volume><issue>1</issue><fpage>1</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.1007/s10115-007-0114-2</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>KDD &#x2019;16: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name><name name-style="western"><surname>Seneviratne</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Weakly supervised natural language processing for assessing patient-centered outcome following prostate cancer treatment</article-title><source>JAMIA Open</source><year>2019</year><month>04</month><volume>2</volume><issue>1</issue><fpage>150</fpage><lpage>159</lpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooy057</pub-id><pub-id pub-id-type="medline">31032481</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary Tables 1 through 5 present performance comparisons for random forest, extreme gradient boosting, and k-nearest neighbors models on a chronic obstructive pulmonary disease dataset. 
These evaluations incorporate bag of words, term frequency-inverse document frequency, lightweight document embeddings, compression-based, and Unified Medical Language System representations. Supplementary Figure 1 illustrates variations in precision, sensitivity, and specificity across different thresholds.</p><media xlink:href="medinform_v14i1e84326_app1.docx" xlink:title="DOCX File, 90 KB"/></supplementary-material></app-group></back></article>