<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e63720</article-id><article-id pub-id-type="doi">10.2196/63720</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>An Extraction Tool for Venous Thromboembolism Symptom Identification in Primary Care Notes to Facilitate Electronic Clinical Quality Measure Reporting: Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Novoa-Laurentiev</surname><given-names>John</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bowen</surname><given-names>Mica</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pullman</surname><given-names>Avery</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Song</surname><given-names>Wenyu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Syrowatka</surname><given-names>Ania</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Jin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sainlaire</surname><given-names>Michael</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Frank</given-names></name><degrees>MSE</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gray</surname><given-names>Krissy</given-names></name><degrees>AAS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Panta</surname><given-names>Purushottam</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Luwei</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nawab</surname><given-names>Khalid</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hijjawi</surname><given-names>Shadi</given-names></name><degrees>MBA, MD</degrees><xref ref-type="aff" 
rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schreiber</surname><given-names>Richard</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Li</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dykes</surname><given-names>Patricia C</given-names></name><degrees>RN, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Medicine, Brigham &#x0026; Women's Hospital</institution><addr-line>75 Francis Street</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Medicine, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Medicine, University of Alabama at Birmingham</institution><addr-line>Birmingham</addr-line><addr-line>AL</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Biomedical Informatics, University of Kentucky</institution><addr-line>Lexington</addr-line><addr-line>KY</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Information Services, Penn State Health</institution><addr-line>Hershey</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Medicine, Penn State College of Medicine</institution><addr-line>Hershey</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff 
id="aff7"><institution>Department of Biomedical Informatics and Data Science, Johns Hopkins School of Medicine</institution><addr-line>Baltimore</addr-line><addr-line>MD</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Arasteh</surname><given-names>Soroosh Tayebi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to John Novoa-Laurentiev, MS, Department of Medicine, Brigham &#x0026; Women's Hospital, 75 Francis Street, Boston, MA, 02115, United States, 1 8572824088; <email>jnovoa-laurentiev@bwh.harvard.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>26</day><month>8</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e63720</elocation-id><history><date date-type="received"><day>28</day><month>06</month><year>2024</year></date><date date-type="rev-recd"><day>05</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; John Novoa-Laurentiev, Mica Bowen, Avery Pullman, Wenyu Song, Ania Syrowatka, Jin Chen, Michael Sainlaire, Frank Chang, Krissy Gray, Purushottam Panta, Luwei Liu, Khalid Nawab, Shadi Hijjawi, Richard Schreiber, Li Zhou, Patricia C Dykes. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 26.8.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e63720"/><abstract><sec><title>Background</title><p>Diagnosis of venous thromboembolism (VTE) is often delayed, and facilitating earlier diagnosis may improve associated morbidity and mortality. Clinical notes contain information not found elsewhere in the medical record that could facilitate timely VTE diagnosis and accurate quality measurement. However, extracting relevant information from unstructured clinical notes is complex. Today, there are relatively few electronic clinical quality measures (eCQMs) in our national payment program and none that use natural language processing (NLP) techniques for data extraction. NLP holds great promise for making quality measurement more accurate and more efficient. 
Given the potential of NLP-based applications to facilitate more accurate VTE detection, primary care is one clinical setting in urgent need of this type of tool.</p></sec><sec><title>Objective</title><p>This study aimed to develop a tool that extracts VTE symptoms from clinical notes for use within an eCQM to quantify the rate of delayed diagnosis of VTE in primary care settings.</p></sec><sec sec-type="methods"><title>Methods</title><p>We iteratively developed an NLP-based data extraction tool, venous thromboembolism symptom extractor (VTExt), on an internal dataset using a rule-based approach to extract VTE symptoms from primary care clinical note text. The VTE symptoms lexicon was derived and optimized with physician guidance and externally validated using datasets from 2 independent health care organizations. We performed 26 rounds of performance evaluation of notes sampled from the case cohort (17,585 patient progress note sentences from 279 patient notes), and 5 rounds of evaluation of the control cohort (2838 patient progress note sentences from 50 patient notes). VTExt&#x2019;s performance was evaluated using evaluation metrics, including area under the curve, positive predictive value, negative predictive value, sensitivity, and specificity.</p></sec><sec sec-type="results"><title>Results</title><p>VTExt achieved near-perfect performance in extracting VTE symptoms from primary care notes sampled from records of patients diagnosed with or without VTE. In external validation, VTExt achieved promising performance in 2 additional geographically distant organizations using different electronic health record systems. 
When compared against a deep learning model and 4 machine learning models, VTExt exhibited similar or even improved performance across all metrics.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrates a data-driven NLP-based approach to clinical note information extraction that can be generalized to different electronic health record systems across different institutions. Due to the robust performance of this tool, VTExt is the first NLP application to be used in a nationally endorsed eCQM.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>venous thromboembolism</kwd><kwd>electronic clinical quality measure</kwd><kwd>timely diagnosis</kwd><kwd>primary care</kwd><kwd>NLP</kwd><kwd>thromboembolism</kwd><kwd>clinical quality</kwd><kwd>algorithm development</kwd><kwd>algorithm validation</kwd><kwd>VTE</kwd><kwd>diagnosis</kwd><kwd>primary care</kwd><kwd>extraction tool</kwd><kwd>tool</kwd><kwd>clinical note</kwd><kwd>extraction</kwd><kwd>AI</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Venous thromboembolism (VTE) is an often undetected condition that includes both deep vein thrombosis (clots in the deep veins of the body [<xref ref-type="bibr" rid="ref1">1</xref>]), and pulmonary embolism (PE; clot breaking free and entering the pulmonary arteries [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]). VTE is associated with increased morbidity and mortality [<xref ref-type="bibr" rid="ref3">3</xref>] with a 1-year VTE case-fatality rate estimated at 23% [<xref ref-type="bibr" rid="ref4">4</xref>] and associated with increased health care costs [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>The incidence of VTE in the United States is unknown as there is currently no national VTE surveillance system in place [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Cases are often missed since they are asymptomatic or associated with symptoms similar to those of other chronic conditions, leading to substantial undercounting. In a 2015 literature review, Heit [<xref ref-type="bibr" rid="ref6">6</xref>] identified the incidence of VTE as ranging from 104 to 183 cases per 100,000 person-years. This rate is based largely on Caucasian populations [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] and differs by race where African American individuals face higher rates of VTE [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>], and Asian [<xref ref-type="bibr" rid="ref20">20</xref>], Asian American [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], and Native American individuals [<xref ref-type="bibr" rid="ref23">23</xref>] see a lower VTE incidence. Higher levels of education, income, and employment status have also been shown to be associated with decreased risk of VTE [<xref ref-type="bibr" rid="ref24">24</xref>]. Risk factors for VTE include a history of VTE [<xref ref-type="bibr" rid="ref25">25</xref>] (rates of recurrent VTE range from 20%&#x2010;36% within 10 years of the initial VTE event [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]), older age [<xref ref-type="bibr" rid="ref1">1</xref>], recent immobility or surgery, cancer, smoking, thrombophilia [<xref ref-type="bibr" rid="ref28">28</xref>], and obesity [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Delayed diagnosis of VTE is common due to its nonspecific symptoms [<xref ref-type="bibr" rid="ref29">29</xref>]. VTE can also be difficult to identify in the electronic health record (EHR) due to variability in how VTE is documented and coded [<xref ref-type="bibr" rid="ref30">30</xref>]. 
Due to these challenges and the lack of national surveillance, the incidence of VTE is likely underestimated [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Tools to facilitate measurement and earlier diagnoses of VTE may help in better understanding VTE risk factors, reduce associated morbidity and mortality [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>], and improve patient safety.</p><p>The widespread adoption of interoperable EHR systems after the 2009 Health Information Technology for Economic and Clinical Health Act [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>] has led to a significant increase in unstructured text data, such as radiology reports, progress notes, and discharge summaries [<xref ref-type="bibr" rid="ref37">37</xref>]. These unstructured data are estimated to constitute over 80% of health and biomedical information [<xref ref-type="bibr" rid="ref37">37</xref>]. Free-text clinical notes in EHRs hold valuable insights for population-level quality improvement, but efficient strategies leveraging AI, machine learning, and natural language processing (NLP) are essential to harness this potential.</p><p>NLP is useful for analyzing unstructured EHR data in areas like radiology [<xref ref-type="bibr" rid="ref38">38</xref>], oncology [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], endocrinology [<xref ref-type="bibr" rid="ref41">41</xref>], substance misuse [<xref ref-type="bibr" rid="ref42">42</xref>], PE identification [<xref ref-type="bibr" rid="ref43">43</xref>], and postoperative VTE [<xref ref-type="bibr" rid="ref44">44</xref>]. By extracting information from text, NLP creates structured data, reducing manual review and enabling large-scale automated processing [<xref ref-type="bibr" rid="ref45">45</xref>]. 
High-throughput phenotyping algorithms using NLP-derived and structured data show promise for developing standardized labeling [<xref ref-type="bibr" rid="ref46">46</xref>], particularly for managing complex diseases in large-scale patient populations. NLP can also uncover critical information overlooked using structured variables [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. While large language models (LLMs) are popular for NLP tasks, they are often more resource-intensive and costly than traditional machine learning or rule-based methods [<xref ref-type="bibr" rid="ref49">49</xref>]. Though machine learning methods tend to have improved performance, a rule-based approach has advantages, such as traceability of results and speed of development [<xref ref-type="bibr" rid="ref50">50</xref>].</p><p>NLP tools can detect VTE events, but more sensitive tools are needed to identify VTE events specifically in primary care EHR progress notes [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. The objective of this study was to develop a simple, accessible NLP tool for identifying VTE symptoms in primary care EHRs, suitable for both high- and low-resource settings and aligned with the national quality payment program. The tool was tested on external datasets to evaluate its performance compared with deep learning and machine learning models. The main aim is to use narrative EHR data for clinical quality reporting to identify missed or delayed diagnoses of VTE after a primary care visit. 
A delayed diagnosis is defined as one that occurs &#x003E;24 hours after the primary care visit when the VTE symptoms were documented.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Sources, Cohort Development, and Feature Selection Strategy</title><p>The study was conducted at Mass General Brigham (MGB), an integrated health care delivery system in Greater Boston, Massachusetts, using data from the MGB Enterprise Data Warehouse (EDW), an MGB central clinical data warehouse.</p><p>We used 2 internal datasets to develop and evaluate our NLP application for symptom extraction, and 2 independent external datasets to test how well it works in other settings. The first internal dataset, the case cohort, was used for development and evaluation. Inclusion criteria for this cohort are described below. The second internal dataset, the control cohort, included patients who did not meet case cohort inclusion criteria and was used for further evaluation. The external validation datasets came from 2 university health systems: the University of Kentucky and Penn State Health. These datasets were used to test if our symptom extractor works well with notes from different EHR vendors and health care systems.</p><p>We developed a multifactor phenotyping algorithm to identify VTE patients in the MGB cohort [<xref ref-type="bibr" rid="ref52">52</xref>]. This included patients diagnosed with VTE from 2016 through 2021 who had a primary care visit in the 30 days before the date of diagnosis. We started by using <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) codes to identify an initial VTE patient cohort. Then we combined data from imaging records (eg, current procedural terminology [CPT] codes) and anticoagulant orders (RxNorm codes) to further refine the initial cohort and develop the final VTE case cohort. 
The date and time of VTE diagnosis were defined as when the radiologist signed off on the scan report [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>].</p><p>We used a rule-based approach to identify terms from a lexicon derived from a set of VTE signs and symptoms. The lexicon was divided into 3 parts: one with relevant symptoms dependent on the part of the body (eg, swelling), another with the relevant symptom locations (eg, leg), and the last containing location-independent symptoms (eg, cough). Location-dependent symptoms required identification of both the symptom and a relevant location to be considered a symptom match. The lexicon was reviewed and revised over the course of the study in accordance with physician expert guidance.</p></sec><sec id="s2-2"><title>Clinician-Guided VTE Lexicon Development and Optimization</title><p>We identified VTE-related signs and symptoms by combining a literature review with interviews of physicians with experience in treating VTE patients. Multiple optimization steps were conducted: first, we conducted a comprehensive literature review to create an initial list of signs and symptoms. Then, we held 1-hour semistructured interviews with 5 experienced physicians to gain additional insight into signs and symptoms based on clinical experience. Signs and symptoms were also reviewed by a technical expert panel over the course of development, and their feedback was used to finalize the lexicon. In total, we included 29 distinct symptoms in the lexicon, consisting of 7 location-independent symptoms, 7 location-dependent symptoms, and 4 relevant locations. The final VTE symptom 3-part lexicon can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Inclusion criteria <italic>ICD-10</italic>, CPT, and RxNorm codes are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
The prevalence of each symptom in each dataset is provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s2-3"><title>Extractor Development and Optimization</title><p>The Medical Text Extraction, Reasoning and Mapping System (MTERMS) [<xref ref-type="bibr" rid="ref54">54</xref>] venous thromboembolism symptom extractor (VTExt) was developed using the Python programming language. We chose a rule-based approach to identify symptoms in order to facilitate transferability of the tool and to ensure transparency of its workings, which can be challenging when using more complex machine learning or LLM-based approaches [<xref ref-type="bibr" rid="ref55">55</xref>]. Using a rule-based approach also suited the need for VTExt to identify VTE symptoms within specific contexts, for example, at specific body locations.</p><p>The development cycle used in the creation of VTExt entailed initial analysis of symptom extractor requirements, design and implementation of the extractor, iterative testing on samples of patient notes, and adjusting VTExt based on error analyses. The overall study design and development process is provided in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>We first reviewed a small sample of cases from the dataset described above to understand how VTE symptoms appear in primary care progress notes, for example, how providers document VTE symptoms. The initial version of VTExt was then used to extract symptoms from a batch of sampled primary care progress notes. A trained chart abstractor reviewed each sentence analyzed by VTExt and determined whether the structured output was accurate, marking each case as a true positive, true negative, false positive, or false negative. Whenever an error occurred, the reason was identified, and adjustments were made to the extractor to avoid the error in the processing of future batches. 
We repeated this optimization process of running the extractor on a new sample of 10&#x2010;15 notes, reviewing output, and refining the pattern-matching to iteratively improve the performance of the symptom extractor until we achieved a precision (positive predictive value [PPV]) of at least 0.95.</p><p>For each round, one progress note from each patient visit was extracted and combined into a single file. Patient notes were split into sentences using the MTERMS NLP system [<xref ref-type="bibr" rid="ref54">54</xref>]. The symptom extractor then used regular expression-based rules to identify signs and symptoms of VTE in the curated lexicon and wrote output to a structured query language database to allow for integration of extractor output into other pipelines, including mapping symptoms to standardized terminologies. The NLP output table contains one column for each VTE symptom in the lexicon. Each row in the table corresponds to 1 patient note, and a binary output value for each symptom field indicates whether a given symptom was detected in the note by VTExt&#x2014;if yes, presence was indicated with a value of &#x201C;1,&#x201D; and if not, a value of &#x201C;0.&#x201D;</p><p>To facilitate the clinical implementation of our tool, we developed a streamlined version of VTExt with simplified output for use with the electronic clinical quality measure (eCQM). Instead of producing output values for presence of individual signs and symptoms, this version produced a single &#x201C;0&#x201D; or &#x201C;1&#x201D; value for each patient note to indicate whether at least 1 VTE symptom was identified. This streamlined version of VTExt was used in the external evaluation of the tool. Pseudocode for the tool can be found on our project GitHub page [<xref ref-type="bibr" rid="ref56">56</xref>].</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Venous thromboembolism symptom extractor development and evaluation process. 
EDW: enterprise data warehouse; EHR: electronic health record; MGB: Mass General Brigham; VTExt: venous thromboembolism symptom extractor.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e63720_fig01.png"/></fig></sec><sec id="s2-4"><title>Note Processing and Evaluation</title><p>We evaluated the VTExt symptom extractor using both internal and external datasets. For internal evaluation, we used both a case cohort and a control cohort. The case cohort included patients who met our inclusion criteria for incident VTE based on the presence of 3 codes: <italic>ICD-10</italic> VTE codes, CPT imaging codes, and RxNorm anticoagulation codes [<xref ref-type="bibr" rid="ref53">53</xref>]. The control cohort included patients who did not meet these criteria.</p><p>Internal evaluation of the VTExt symptom extractor was an iterative process, illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. From all patients who met the case cohort inclusion criteria, we randomly selected batches of 10 to 15 patient visits for each round of testing. We used a similar method to sample control notes to evaluate how well the symptom extractor generalized to patients that did not meet the case cohort inclusion criteria (eg, patients who did not have a VTE diagnosis).</p></sec><sec id="s2-5"><title>External Evaluation</title><p>We worked with collaborators at both the University of Kentucky and Penn State Health (PSH) to test VTExt on patient notes. These sites used different EHR systems which also differed from MGB and had different textual data structures. In Epic (used at MGB), patient notes exist in tables, which include note-related information including metadata and the note content itself. Veradigm (formerly Allscripts; used by University of Kentucky) and Oracle Cerner (used by PSH) similarly store patient note data in document tables. 
For free text notes in Veradigm and Oracle Cerner, note contents of many documents are stored in &#x201C;Character Large Objects&#x201D; or &#x201C;Binary Large Objects&#x201D; fields. Notes in these areas of the database require special querying techniques to extract unstructured text, usually requiring certified analysts. Despite these differences, once note text data are available, the NLP tool functions properly irrespective of the EHR as it is not dependent on the EHR itself.</p><p>In addition, each system served a different population: MGB serving mostly urban and metro, University of Kentucky serving more rural, and PSH serving a mixed population of urban, metro, and rural. The diversity of sites included served as a good preliminary test for generalizability of VTExt.</p><p>During external evaluation, we compared the performance of the rule-based extractor against a pretrained sequence classification deep learning model derived from Bio+Clinical BERT (bidirectional encoder representations from transformers; using the HuggingFace transformers Python package), a contextualized word representation model based on BioBERT and trained further on Medical Information Mart for Intensive Care (MIMIC) data [<xref ref-type="bibr" rid="ref57">57</xref>-<xref ref-type="bibr" rid="ref60">60</xref>]. We also compared performance against 4 classical machine learning models: logistic regression, support vector machine (SVM), and random forest, implemented using the Python Scikit-learn module, and gradient boosting, implemented using the Python XGBoost module [<xref ref-type="bibr" rid="ref61">61</xref>-<xref ref-type="bibr" rid="ref63">63</xref>]. MGB data used during the development of VTExt were preprocessed using the Bio+Clinical BERT tokenizer for further training of the deep learning model. 
For training the 4 classical models, the MGB data were instead represented as unigrams transformed using term frequency&#x2013;inverse document frequency (TF-IDF) [<xref ref-type="bibr" rid="ref64">64</xref>]. For all models, data were divided into training and validation sets for training and tuning of model parameters, respectively. Final parameters for deep learning and machine learning models are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. Each external site manually labeled a testing set of 500 note sentences for evaluation, 250 containing at least one VTE symptom and 250 with no VTE symptoms.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This project was reviewed and approved by the Mass General Brigham institutional review board (protocol #2020P003979). In this protocol, a waiver of informed consent and a waiver of HIPAA (Health Insurance Portability and Accountability Act) authorization were requested because this quality improvement research involves no more than minimal risk to the participants and the research could not practicably be carried out without the waiver given the large number of patients who had a VTE diagnosis in a primary care setting. In addition, this research could not practicably be conducted without access to and use of the protected health information. The following procedures were followed to prevent breaches of confidentiality: (1) data were accessed only behind the MGB firewall using password-protected, secure devices by Collaborative Institutional Training Initiative&#x2013;certified study staff, and (2) we will destroy all patient identifiers at the end of the study, once analysis and publications are finalized. In accordance with the approved institutional review board protocol, all electronic data were kept in password-protected files on a secure server behind the MGB firewall. 
Only study personnel were given a unique identifier&#x2014;no participant identifiers are linked to the data. No compensation was provided for participation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>We performed 26 rounds of evaluation of VTExt performance on notes sampled from the case cohort. This included 17,585 patient progress note sentences from 279 notes from distinct patients, 171 of which were found to contain 1 or more VTE symptoms. Evaluation of the control cohort included 2838 note sentences from 50 patient notes over the course of five rounds of evaluation, of which 21 notes contained at least 1 relevant symptom.</p><p>Performance was evaluated at the sentence level. We measured precision (PPV), recall (sensitivity), specificity, and negative predictive value (NPV; <xref ref-type="table" rid="table1">Table 1</xref>). Of these metrics, achieving a high precision score proved to be the greatest challenge. Many false positives initially arose due to 3 kinds of errors, shown in <xref ref-type="table" rid="table2">Table 2</xref>. Some errors were due to word misspellings in the notes (which we refer to as type A errors). For example, misspelling of the word &#x201C;denies&#x201D; caused VTExt to miss negation of subsequent VTE symptoms. In other cases, an error occurred because a symptom was identified but was attributed to the incorrect body part (a type B error). 
Many false positives arose in early stages of evaluation from failure to detect negation or context, as in the type C error examples in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Venous thromboembolism symptom extractor validation performance on notes of the case cohort (patients with venous thromboembolism diagnosis).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Validation round</td><td align="left" valign="bottom">Sentences, n</td><td align="left" valign="bottom">Precision (positive predictive value)</td><td align="left" valign="bottom">Recall (sensitivity)</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">Negative predictive value</td></tr></thead><tbody><tr><td align="left" valign="top">Round 1</td><td align="char" char="." valign="top">673</td><td align="char" char="." valign="top">0.500</td><td align="char" char="." valign="top">0.863</td><td align="char" char="." valign="top">0.929</td><td align="char" char="." valign="top">0.988</td></tr><tr><td align="left" valign="top">Round 9</td><td align="char" char="." valign="top">692</td><td align="char" char="." valign="top">0.851</td><td align="char" char="." valign="top">0.966</td><td align="char" char="." valign="top">0.984</td><td align="char" char="." valign="top">0.997</td></tr><tr><td align="left" valign="top">Round 17</td><td align="char" char="." valign="top">489</td><td align="char" char="." valign="top">0.750</td><td align="char" char="." valign="top">1.000</td><td align="char" char="." valign="top">0.998</td><td align="char" char="." valign="top">1.000</td></tr><tr><td align="left" valign="top">Round 26</td><td align="char" char="." valign="top">938</td><td align="char" char="." valign="top">1.000</td><td align="char" char="." valign="top">1.000</td><td align="char" char="." valign="top">1.000</td><td align="char" char="." 
valign="top">1.000</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Examples of common sources of symptom extractor false positive errors.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error type</td><td align="left" valign="bottom">Examples</td></tr></thead><tbody><tr><td align="left" valign="top">Type A: misspelling</td><td align="left" valign="top">She &#x201C;deneis&#x201D; shortness of breath or pleuritic chest pain</td></tr><tr><td align="left" valign="top">Type B: symptom attributed to wrong body part</td><td align="left" valign="top">Worsening R hip &#x201C;pain&#x201D; as well as recent development of R &#x201C;leg,&#x201D; ankle, and foot erythema</td></tr><tr><td align="left" valign="top">Type C: negation or context</td><td align="left" valign="top">&#x201C;Resolution&#x201D; of hypoxia and chest pain. Nitroglycerin 0.4 MG SL tablet place 1 tablet (0.4 mg total) under the tongue every 5 (five) minutes &#x201C;as needed&#x201D; for chest pain</td></tr></tbody></table></table-wrap><p>For the first example, VTExt captured the symptom hypoxia without identifying the negating phrase &#x201C;resolution of.&#x201D; In the second example, though chest pain is mentioned, it appears in the context of a medication to be taken as needed, which we deemed not to be strong enough evidence of the presence of a symptom. Repeated validation allowed us to learn what contexts and negating phrases appeared in clinical text, and this knowledge was used to improve VTExt&#x2019;s ability to locate them. Through this process, precision improved from 0.5 in the first round of testing to near-perfect in the final round. Near-perfect performance was also achieved for recall, specificity, and NPV in the final round of validation. 
In addition, we tested the extractor on several random samples of primary care clinical notes of patients in the control cohort, that is, those not diagnosed with VTE (<xref ref-type="table" rid="table3">Table 3</xref>, in batches of 10&#x2010;15 notes, with precision ultimately reaching 0.85).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Venous thromboembolism symptom extractor validation performance on notes of the control cohort (patients with no venous thromboembolism diagnosis).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Validation round</td><td align="left" valign="bottom">N</td><td align="left" valign="bottom">Precision (positive predictive value)</td><td align="left" valign="bottom">Recall (sensitivity)</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">Negative predictive value</td></tr></thead><tbody><tr><td align="left" valign="top">Round 1</td><td align="left" valign="top">281</td><td align="left" valign="top">0.533</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.974</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Round 2</td><td align="left" valign="top">471</td><td align="left" valign="top">0.556</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.991</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Round 3</td><td align="left" valign="top">613</td><td align="left" valign="top">0.750</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.998</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Round 4</td><td align="left" valign="top">559</td><td align="left" valign="top">0.806</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.989</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Round 5</td><td 
align="left" valign="top">912</td><td align="left" valign="top">0.850</td><td align="left" valign="top">0.895</td><td align="left" valign="top">0.997</td><td align="left" valign="top">0.998</td></tr></tbody></table></table-wrap><p>As seen in the external evaluation results in <xref ref-type="table" rid="table4">Table 4</xref>, performance metrics for the rule-based extractor were similar to or better than those for the deep learning and machine learning models at both external testing sites. While VTExt&#x2019;s precision and specificity scored high, sensitivity showed room for improvement (0.61 and 0.66 at PSH and University of Kentucky, respectively).</p><p>Error analysis of external testing results showed many deep learning model false negatives falling into 2 categories. Some errors can be attributed to overrepresentation of negated instances of certain VTE symptoms in the training dataset. This then makes the model more inclined to mark note sentences containing said symptoms as negative, even when the symptom is not negated. For the second category, less common terms used to describe relevant symptoms appear in testing data, for example, &#x201C;malleoli&#x201D; used in describing swelling of the ankle. If such terms are not present in the training data, the model has no way of knowing they are relevant.</p><p>The rule-based model also produced false negatives, many belonging to one of two types. First, some errors can be attributed to double negation, which VTExt is not currently able to handle. For example, &#x201C;SOB not resolved&#x201D;&#x2014;here, we see a VTE symptom, shortness of breath (SOB), followed by the negating term &#x201C;resolved.&#x201D; However, &#x201C;resolved&#x201D; itself has been negated, and so this represents a positive instance. The second error type pertains to synonymous terms or phrases for VTE symptoms that are not currently included in the lexicon, for example, &#x201C;black and blue area&#x201D; as another way to phrase bruising. 
Since the phrase &#x201C;black and blue area&#x201D; is not part of the symptom lexicon, the rule-based model did not detect the symptom.</p><p>The results for the eCQM have been reported elsewhere [<xref ref-type="bibr" rid="ref53">53</xref>]. The calculated rate of delayed VTE diagnosis was over 70% at both MGB and University of Kentucky, suggesting a clinically and practically meaningful measure for understanding delayed diagnosis rates across diverse health care sites.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of venous thromboembolism symptom extractor, deep learning, and machine learning models at University of Kentucky and Penn State Health sites.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric (95% CI)</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">NPV<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">University of Kentucky</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VTExt<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" 
valign="top">1.00</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.66</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.83</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Deep learning</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.63 (0.58&#x2010;0.68)</td><td align="left" valign="top">0.42 (0.35&#x2010;0.48)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.71 (0.67&#x2010;0.75)</td><td align="left" valign="top">0.71 (0.68&#x2010;0.74)</td><td align="left" valign="top">0.85 (0.83&#x2010;0.87)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.98 (0.96&#x2010;1.00)</td><td align="left" valign="top">0.71 (0.66&#x2010;0.76)</td><td align="left" valign="top">0.60 (0.53&#x2010;0.66)</td><td align="left" valign="top">0.99 (0.97&#x2010;1.00)</td><td align="left" valign="top">0.79 (0.76&#x2010;0.83)</td><td align="left" valign="top">0.79 (0.76&#x2010;0.82)</td><td align="left" valign="top">0.89 (0.87&#x2010;0.91)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Logistic regression</td><td align="left" valign="top">0.95 (0.87&#x2010;1.00)</td><td align="left" valign="top">0.54 (0.49&#x2010;0.58)</td><td align="left" valign="top">0.16 (0.11&#x2010;0.20)</td><td align="left" valign="top">0.99 (0.98&#x2010;1.00)</td><td align="left" valign="top">0.57 (0.53&#x2010;0.62)</td><td align="left" valign="top">0.57 (0.55&#x2010;0.60)</td><td align="left" valign="top">0.76 (0.71&#x2010;0.80)</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.52 (0.48&#x2010;0.57)</td><td align="left" valign="top">0.08 (0.05&#x2010;0.12)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.54 (0.50&#x2010;0.58)</td><td align="left" valign="top">0.54 (0.52&#x2010;0.56)</td><td align="left" valign="top">0.77 (0.75&#x2010;0.79)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SVM<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">0.98 (0.94&#x2010;1.00)</td><td align="left" valign="top">0.56 (0.51&#x2010;0.61)</td><td align="left" valign="top">0.22 (0.16&#x2010;0.27)</td><td align="left" valign="top">1.00 (0.99&#x2010;1.00)</td><td align="left" valign="top">0.61 (0.56&#x2010;0.65)</td><td align="left" valign="top">0.61 (0.58&#x2010;0.63)</td><td align="left" valign="top">0.79 (0.76&#x2010;0.82)</td></tr><tr><td align="left" valign="top">PSH</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VTExt</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.84</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.87</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Deep learning</td><td align="left" valign="top">0.90 (0.85&#x2010;0.94)</td><td align="left" valign="top">0.82 (0.79&#x2010;0.84)</td><td align="left" valign="top">0.55 (0.49&#x2010;0.60)</td><td align="left" valign="top">0.97 (0.96&#x2010;0.98)</td><td align="left" valign="top">0.83 (0.81&#x2010;0.86)</td><td align="left" valign="top">0.76 (0.73&#x2010;0.79)</td><td align="left" valign="top">0.80 (0.77&#x2010;0.83)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.87 (0.82&#x2010;0.91)</td><td align="left" valign="top">0.82 (0.80&#x2010;0.85)</td><td align="left" valign="top">0.58 (0.53&#x2010;0.63)</td><td align="left" valign="top">0.96 (0.94&#x2010;0.97)</td><td align="left" valign="top">0.83 (0.81&#x2010;0.86)</td><td align="left" valign="top">0.77 (0.74&#x2010;0.80)</td><td align="left" valign="top">0.79 (0.76&#x2010;0.83)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Logistic regression</td><td align="left" valign="top">0.86 (0.80&#x2010;0.91)</td><td align="left" valign="top">0.76 (0.73&#x2010;0.79)</td><td align="left" valign="top">0.37 (0.32&#x2010;0.43)</td><td align="left" valign="top">0.97 (0.96&#x2010;0.98)</td><td align="left" valign="top">0.77 (0.75&#x2010;0.80)</td><td align="left" valign="top">0.67 (0.65&#x2010;0.70)</td><td align="left" valign="top">0.72 (0.68&#x2010;0.76)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">0.95 (0.88&#x2010;1.00)</td><td align="left" valign="top">0.70 (0.67&#x2010;0.73)</td><td align="left" valign="top">0.12 (0.08&#x2010;0.15)</td><td align="left" valign="top">1.00 (0.99&#x2010;1.00)</td><td align="left" valign="top">0.71 
(0.68&#x2010;0.74)</td><td align="left" valign="top">0.56 (0.54&#x2010;0.58)</td><td align="left" valign="top">0.68 (0.64&#x2010;0.71)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SVM</td><td align="left" valign="top">0.87 (0.82&#x2010;0.92)</td><td align="left" valign="top">0.77 (0.74&#x2010;0.80)</td><td align="left" valign="top">0.40 (0.35&#x2010;0.45)</td><td align="left" valign="top">0.97 (0.96&#x2010;0.98)</td><td align="left" valign="top">0.78 (0.76&#x2010;0.81)</td><td align="left" valign="top">0.69 (0.66&#x2010;0.71)</td><td align="left" valign="top">0.73 (0.70&#x2010;0.77)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>PPV: positive predictive value.</p></fn><fn id="table4fn2"><p><sup>b</sup>NPV: negative predictive value.</p></fn><fn id="table4fn3"><p><sup>c</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table4fn4"><p><sup>d</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table4fn5"><p><sup>e</sup>VTExt: venous thromboembolism symptom extractor.</p></fn><fn id="table4fn6"><p><sup>f</sup>Not available.</p></fn><fn id="table4fn7"><p><sup>g</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Much of the data not captured in structured EHR fields, like patient symptoms, are found in clinical notes [<xref ref-type="bibr" rid="ref48">48</xref>]. In this study, we developed and validated a simple and generalizable NLP tool to identify and extract signs and symptoms of VTE from primary care notes through an iterative optimization process. VTExt is novel as the first NLP application linked to a nationally endorsed eCQM [<xref ref-type="bibr" rid="ref65">65</xref>], helping to quantify the rate of delayed diagnosis of VTE in primary care. 
Through multiple rounds of optimization, VTExt showed robust performance and speed. Testing at two external sites demonstrated its ability to work well with different datasets and system configurations and its potential for optimizing quality measurement. We suggest that analysts familiar with their EHR and its local configurations could readily apply this NLP tool to their patient notes.</p><p>We learned several important lessons during optimization. Reducing the prevalence of false positives was crucial for improving extractor performance. In early rounds of validation, type B and type C errors often arose in long sentences due to a lack of constraint on the allowed search distance between a VTE symptom and a body part, or between a negating or contextual phrase and a symptom. We experimented with search distances of various lengths and found a distance of 150 characters struck a good balance of incorporating context without introducing too much noise, improving precision while maintaining high sensitivity.</p><p>We focused on primary care progress notes for developing and testing VTExt. Our external evaluation indicated that differences in note styles and hospital policies can affect performance. However, consistent performance observed between the 2 external sites highlighted VTExt&#x2019;s strong generalizability. VTExt&#x2019;s rule-based approach offers advantages including easier implementation, faster processing, and easier interpretation of results when compared with the tested machine learning and deep learning models. Error analysis also revealed further improvement opportunities for the symptom extractor. 
Working with collaborators at external sites to further refine VTExt to reduce false negatives would prove beneficial in improving sensitivity and NPV.</p></sec><sec id="s4-2"><title>Comparison With Previous Work</title><p>Shi et al [<xref ref-type="bibr" rid="ref44">44</xref>] developed an NLP tool to detect postoperative VTE from free-text EHR notes. Internal validation demonstrated a sensitivity of 0.71 and specificity of 0.99. In the 2 health care systems tested, this NLP approach demonstrated superior performance in DVT surveillance compared with existing tools, and similar performance in PE surveillance compared with existing tools. Chapman et al [<xref ref-type="bibr" rid="ref51">51</xref>] developed an NLP-based application to classify pulmonary angiography reports for document-level identification of PE, with test set performance resulting in sensitivity of 0.98 and PPV of 0.83. Sabra et al [<xref ref-type="bibr" rid="ref66">66</xref>] incorporated Unified Medical Language System concept mapping into an NLP tool to generate feature vectors. These were then used to train and test an SVM machine learning model that achieved a PPV and sensitivity of 0.55 and 0.86, respectively. Work done by Jin et al [<xref ref-type="bibr" rid="ref67">67</xref>] to identify VTE in inpatient notes using rule-based NLP methods highlights an approach that achieved similar performance to VTExt (0.90 sensitivity, 1.0 specificity), splitting notes into sentences, and then aggregating sentence-level information to make VTE inference at the sentence, document, and patient level. 
Although many of these tools would not be sufficient for our study&#x2019;s goal of VTE symptom identification for quantifying delayed diagnosis, these studies show that NLP tools can effectively identify VTE events, and there is a need for more sensitive tools to identify VTE events using EHR progress notes in the primary care setting.</p></sec><sec id="s4-3"><title>Limitations</title><p>Our study has a number of limitations. First, VTExt is currently unable to handle misspellings in note text. Revising VTExt to handle misspellings would result in improved performance. Second, MGB was unable to view clinical note data used by external sites in the testing of VTExt in order to maintain patient data privacy. This reduced our ability to improve the tool&#x2019;s generalizability, as MGB was unable to directly review output from the University of Kentucky and PSH other than performance metrics. Third, development and refinement of VTExt was based on 279 patient notes. While high performance was achieved, a wider dataset would provide additional context and understanding of the ways VTE symptoms are documented in clinical note text, allowing for further improvement of the tool.</p></sec><sec id="s4-4"><title>Future Directions</title><p>While a rule-based approach was simpler to implement, future improvements in accessible, high-performance LLMs could make them useful and feasible for quality measurement. These tools have already shown good results in extracting information from radiology reports [<xref ref-type="bibr" rid="ref68">68</xref>], and could also be used to extract signs and symptoms from other types of clinical notes. Since LLMs are trained on large volumes of data, such an approach may generalize better across different health care systems and differently formatted notes when compared with a rule-based method. 
An LLM approach may more easily generalize to extracting symptoms from types of notes other than primary care progress notes, a logical future direction for research in this area. An immediate LLM-based approach was not pursued because we began this project in 2020 before there was mass public access to LLMs. While LLMs are a promising direction for future work, the cost, time, and knowledge required to test such an approach at the collaborating sites were real limiting factors.</p><p>In addition to an LLM approach, future work to improve model performance could include expanding the lexicon of symptom synonyms, as well as more robust handling of context and negation.</p></sec><sec id="s4-5"><title>Conclusions</title><p>We developed a robust and efficient NLP-based tool, VTExt, to extract VTE-associated symptoms from primary care notes. VTExt achieved high sensitivity and specificity, performance that matches or exceeds that of deep learning models and demonstrates its reliability for clinical use. High sensitivity ensures that most patients with VTE symptoms are correctly identified, reducing the risk of missed or delayed diagnoses, which can have serious or fatal consequences. High specificity minimizes false positives, helping avoid unnecessary tests, anxiety, and resource use. Together, these metrics underscore VTExt&#x2019;s clinical value in supporting timely, accurate identification of potential VTE cases from unstructured data.</p><p>VTExt&#x2019;s generalizability across health care systems further supports its real-world applicability, enabling scalable deployment in diverse EHR environments. Its rule-based design facilitates transparency and ease of implementation, particularly for quality measurement initiatives such as tracking delayed diagnosis. 
Furthermore, the clinician-guided optimization process developed alongside VTExt provides a replicable framework for future NLP tool development and integration into clinical workflows, helping bridge the gap between EHR data and actionable insights for patient safety and care improvement.</p></sec></sec></body><back><ack><p>This study was supported by the Gordon and Betty Moore Foundation Diagnostic Delay Of VTE (DOVE) grant, and by the Penn State Clinical and Translational Research Institute, Penn State University Clinical and Translational Science Award, National Institutes of Health/National Center for Advancing Translational Sciences (grant number UL1 TR000127). The contents are solely the responsibility of the authors and do not necessarily represent the official views of the National Institutes of Health or National Center for Advancing Translational Sciences.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated and analyzed during this study are not publicly available as they contain protected health information from patient health records.</p></sec></notes><fn-group><fn fn-type="con"><p>JN was responsible for formal analysis, investigation, methodology, software, validation, visualization, writing the original draft, and reviewing and editing the draft. MB contributed to project administration and writing the original draft. AP handled formal analysis and project administration. WS managed conceptualization, methodology, supervision, and reviewing and editing the manuscript draft. AS contributed to conceptualization, formal analysis, methodology, supervision, and reviewing and editing the manuscript draft. JC conducted investigation, methodology, resources, and supervision. MS managed data curation, investigation, and software. FC handled data curation and software. KG was responsible for data curation, investigation, and software. PP handled data curation, formal analysis, software, and validation. LL conducted formal analysis. 
KN contributed to data curation, funding acquisition, investigation, software, and validation. SH handled supervision, reviewing, and editing. RS managed funding acquisition, investigation, methodology, resources, supervision, writing the original draft, and reviewing and editing the manuscript draft. LZ was responsible for investigation, resources, supervision, and reviewing and editing the manuscript. PCD contributed to conceptualization, funding acquisition, investigation, methodology, resources, supervision, and reviewing and editing the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUPRC</term><def><p>area under the precision-recall curve</p></def></def-item><def-item><term id="abb2">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb3">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb4">DVT</term><def><p>deep vein thrombosis</p></def></def-item><def-item><term id="abb5">eCQM</term><def><p>electronic clinical quality measure</p></def></def-item><def-item><term id="abb6">EDW</term><def><p>enterprise data warehouse</p></def></def-item><def-item><term id="abb7">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb8">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb9"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb10">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb11">MGB</term><def><p>Mass General Brigham</p></def></def-item><def-item><term id="abb12">MIMIC</term><def><p>Medical Information Mart for Intensive Care</p></def></def-item><def-item><term 
id="abb13">MTERMS</term><def><p>Medical Text Extraction, Reasoning and Mapping System</p></def></def-item><def-item><term id="abb14">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb15">NPV</term><def><p>negative predictive value</p></def></def-item><def-item><term id="abb16">PE</term><def><p>pulmonary embolism</p></def></def-item><def-item><term id="abb17">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb18">PSH</term><def><p>Penn State Health</p></def></def-item><def-item><term id="abb19">SOB</term><def><p>shortness of breath</p></def></def-item><def-item><term id="abb20">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb21">TF-IDF</term><def><p>term frequency&#x2014;inverse document frequency</p></def></def-item><def-item><term id="abb22">VTE</term><def><p>venous thromboembolism</p></def></def-item><def-item><term id="abb23">VTExt</term><def><p>venous thromboembolism symptom extractor</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beckman</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Hooper</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Critchley</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Ortel</surname><given-names>TL</given-names> </name></person-group><article-title>Venous thromboembolism: a public health concern</article-title><source>Am J Prev Med</source><year>2010</year><month>04</month><volume>38</volume><issue>4 Suppl</issue><fpage>S495</fpage><lpage>501</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2009.12.017</pub-id><pub-id pub-id-type="medline">20331949</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bruni-Fitzgerald</surname><given-names>KR</given-names> </name></person-group><article-title>Venous thromboembolism: An overview</article-title><source>J Vasc Nurs</source><year>2015</year><month>09</month><volume>33</volume><issue>3</issue><fpage>95</fpage><lpage>99</lpage><pub-id pub-id-type="doi">10.1016/j.jvn.2015.02.001</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Ginsberg</surname><given-names>JS</given-names> </name></person-group><article-title>Relationship between deep venous thrombosis and the postthrombotic syndrome</article-title><source>Arch Intern Med</source><year>2004</year><month>01</month><day>12</day><volume>164</volume><issue>1</issue><fpage>17</fpage><lpage>26</lpage><pub-id pub-id-type="doi">10.1001/archinte.164.1.17</pub-id><pub-id pub-id-type="medline">14718318</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tagalakis</surname><given-names>V</given-names> </name><name name-style="western"><surname>Patenaude</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kahn</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Suissa</surname><given-names>S</given-names> </name></person-group><article-title>Incidence of and mortality from venous thromboembolism in a real-world population: the Q-VTE study cohort</article-title><source>Am J Med</source><year>2013</year><month>09</month><volume>126</volume><issue>9</issue><fpage>832</fpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2013.02.024</pub-id><pub-id 
pub-id-type="medline">23830539</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruppert</surname><given-names>A</given-names> </name><name name-style="western"><surname>Steinle</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lees</surname><given-names>M</given-names> </name></person-group><article-title>Economic burden of venous thromboembolism: a systematic review</article-title><source>J Med Econ</source><year>2011</year><volume>14</volume><issue>1</issue><fpage>65</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.3111/13696998.2010.546465</pub-id><pub-id pub-id-type="medline">21222564</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heit</surname><given-names>JA</given-names> </name></person-group><article-title>Epidemiology of venous thromboembolism</article-title><source>Nat Rev Cardiol</source><year>2015</year><month>08</month><volume>12</volume><issue>8</issue><fpage>464</fpage><lpage>474</lpage><pub-id pub-id-type="doi">10.1038/nrcardio.2015.83</pub-id><pub-id pub-id-type="medline">26076949</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anderson</surname><given-names>FA</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Wheeler</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Goldberg</surname><given-names>RJ</given-names> </name><etal/></person-group><article-title>A population-based perspective of the hospital incidence and case-fatality rates of deep vein thrombosis and pulmonary embolism. 
The Worcester DVT Study</article-title><source>Arch Intern Med</source><year>1991</year><month>05</month><volume>151</volume><issue>5</issue><fpage>933</fpage><lpage>938</lpage><pub-id pub-id-type="medline">2025141</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hansson</surname><given-names>PO</given-names> </name><name name-style="western"><surname>Welin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tibblin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Eriksson</surname><given-names>H</given-names> </name></person-group><article-title>Deep vein thrombosis and pulmonary embolism in the general population. &#x201C;The Study of Men Born in 1913&#x201D;</article-title><source>Arch Intern Med</source><year>1997</year><volume>157</volume><issue>15</issue><fpage>1665</fpage><lpage>1670</lpage><pub-id pub-id-type="medline">9250227</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Silverstein</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Heit</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Mohr</surname><given-names>DN</given-names> </name><name name-style="western"><surname>Petterson</surname><given-names>TM</given-names> </name><name name-style="western"><surname>O&#x2019;Fallon</surname><given-names>WM</given-names> </name><name name-style="western"><surname>Melton</surname><given-names>LJ</given-names>  <suffix>3rd</suffix></name></person-group><article-title>Trends in the incidence of deep vein thrombosis and pulmonary embolism: a 25-year population-based study</article-title><source>Arch Intern 
Med</source><year>1998</year><month>03</month><day>23</day><volume>158</volume><issue>6</issue><fpage>585</fpage><lpage>593</lpage><pub-id pub-id-type="doi">10.1001/archinte.158.6.585</pub-id><pub-id pub-id-type="medline">9521222</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oger</surname><given-names>E</given-names> </name><collab>the EPI-GETBO Study Group</collab></person-group><article-title>Incidence of venous thromboembolism: a community-based study in western France</article-title><source>Thromb Haemost</source><year>2000</year><volume>83</volume><issue>5</issue><fpage>657</fpage><lpage>660</lpage><pub-id pub-id-type="doi">10.1055/s-0037-1613887</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cushman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>AW</given-names> </name><name name-style="western"><surname>White</surname><given-names>RH</given-names> </name><etal/></person-group><article-title>Deep vein thrombosis and pulmonary embolism in two cohorts: the longitudinal investigation of thromboembolism etiology</article-title><source>Am J Med</source><year>2004</year><month>07</month><day>1</day><volume>117</volume><issue>1</issue><fpage>19</fpage><lpage>25</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2004.01.018</pub-id><pub-id pub-id-type="medline">15210384</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heit</surname><given-names>JA</given-names> </name></person-group><article-title>Venous thromboembolism: disease burden, outcomes and risk factors</article-title><source>J Thromb 
Haemost</source><year>2005</year><month>08</month><volume>3</volume><issue>8</issue><fpage>1611</fpage><lpage>1617</lpage><pub-id pub-id-type="doi">10.1111/j.1538-7836.2005.01415.x</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spencer</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Emery</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lessard</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The Worcester Venous Thromboembolism study: a population-based study of the clinical epidemiology of venous thromboembolism</article-title><source>J Gen Intern Med</source><year>2006</year><month>07</month><volume>21</volume><issue>7</issue><fpage>722</fpage><lpage>727</lpage><pub-id pub-id-type="doi">10.1111/j.1525-1497.2006.00458.x</pub-id><pub-id pub-id-type="medline">16808773</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>N&#x00E6;ss</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Christiansen</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Romundstad</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cannegieter</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Rosendaal</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Hammerstr&#x00F8;m</surname><given-names>J</given-names> </name></person-group><article-title>Incidence and mortality of venous thrombosis: a population&#x2010;based study</article-title><source>J Thromb Haemost</source><year>2007</year><month>04</month><volume>5</volume><issue>4</issue><fpage>692</fpage><lpage>699</lpage><pub-id 
pub-id-type="doi">10.1111/j.1538-7836.2007.02450.x</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spencer</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Emery</surname><given-names>C</given-names> </name><name name-style="western"><surname>Joffe</surname><given-names>SW</given-names> </name><etal/></person-group><article-title>Incidence rates, clinical profile, and outcomes of patients with venous thromboembolism. The Worcester VTE study</article-title><source>J Thromb Thrombolysis</source><year>2009</year><month>11</month><volume>28</volume><issue>4</issue><fpage>401</fpage><lpage>409</lpage><pub-id pub-id-type="doi">10.1007/s11239-009-0378-3</pub-id><pub-id pub-id-type="medline">19629642</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Goldberg</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Kiefe</surname><given-names>CI</given-names> </name><name name-style="western"><surname>Spencer</surname><given-names>FA</given-names> </name></person-group><article-title>Secular trends in occurrence of acute venous thromboembolism: the Worcester VTE study (1985-2009)</article-title><source>Am J Med</source><year>2014</year><month>09</month><volume>127</volume><issue>9</issue><fpage>829</fpage><lpage>839</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2014.03.041</pub-id><pub-id pub-id-type="medline">24813864</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>White</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Romano</surname><given-names>PS</given-names> </name></person-group><article-title>Incidence of idiopathic deep venous thrombosis and secondary thromboembolism among ethnic groups in California</article-title><source>Ann Intern Med</source><year>1998</year><month>05</month><day>1</day><volume>128</volume><issue>9</issue><fpage>737</fpage><lpage>740</lpage><pub-id pub-id-type="doi">10.7326/0003-4819-128-9-199805010-00006</pub-id><pub-id pub-id-type="medline">9556467</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schneider</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lilienfeld</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Im</surname><given-names>W</given-names> </name></person-group><article-title>The epidemiology of pulmonary embolism: racial contrasts in incidence and in-hospital case fatality</article-title><source>J Natl Med Assoc</source><year>2006</year><month>12</month><volume>98</volume><issue>12</issue><fpage>1967</fpage><lpage>1972</lpage><pub-id pub-id-type="medline">17225843</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zakai</surname><given-names>NA</given-names> </name><name name-style="western"><surname>McClure</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Judd</surname><given-names>SE</given-names> </name><etal/></person-group><article-title>Racial and regional differences in venous thromboembolism in the United States in 3 
cohorts</article-title><source>Circulation</source><year>2014</year><month>04</month><day>8</day><volume>129</volume><issue>14</issue><fpage>1502</fpage><lpage>1509</lpage><pub-id pub-id-type="doi">10.1161/CIRCULATIONAHA.113.006472</pub-id><pub-id pub-id-type="medline">24508826</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheuk</surname><given-names>BLY</given-names> </name><name name-style="western"><surname>Cheung</surname><given-names>GCY</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>SWK</given-names> </name></person-group><article-title>Epidemiology of venous thromboembolism in a Chinese population</article-title><source>Br J Surg</source><year>2004</year><month>04</month><volume>91</volume><issue>4</issue><fpage>424</fpage><lpage>428</lpage><pub-id pub-id-type="doi">10.1002/bjs.4454</pub-id><pub-id pub-id-type="medline">15048741</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klatsky</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Armstrong</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Poggi</surname><given-names>J</given-names> </name></person-group><article-title>Risk of pulmonary embolism and/or deep venous thrombosis in Asian-Americans</article-title><source>Am J Cardiol</source><year>2000</year><month>06</month><day>1</day><volume>85</volume><issue>11</issue><fpage>1334</fpage><lpage>1337</lpage><pub-id pub-id-type="doi">10.1016/s0002-9149(00)00766-9</pub-id><pub-id pub-id-type="medline">10831950</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>White</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Murin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Harvey</surname><given-names>D</given-names> </name></person-group><article-title>Effect of ethnicity and gender on the incidence of venous thromboembolism in a diverse population in California in 1996</article-title><source>Thromb Haemost</source><year>2005</year><month>02</month><volume>93</volume><issue>2</issue><fpage>298</fpage><lpage>305</lpage><pub-id pub-id-type="doi">10.1160/TH04-08-0506</pub-id><pub-id pub-id-type="medline">15711746</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hooper</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Holman</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Heit</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Cobb</surname><given-names>N</given-names> </name></person-group><article-title>Venous thromboembolism hospitalizations among American Indians and Alaska Natives</article-title><source>Thromb Res</source><year>2002</year><month>12</month><day>15</day><volume>108</volume><issue>5-6</issue><fpage>273</fpage><lpage>278</lpage><pub-id pub-id-type="doi">10.1016/s0049-3848(03)00058-6</pub-id><pub-id pub-id-type="medline">12676185</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>J&#x00F8;rgensen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Horv&#x00E1;th-Puh&#x00F3;</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Laugesen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Braekkan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hansen</surname><given-names>JB</given-names> </name><name name-style="western"><surname>S&#x00F8;rensen</surname><given-names>HT</given-names> </name></person-group><article-title>Socioeconomic status and risk of incident venous thromboembolism</article-title><source>J Thromb Haemost</source><year>2021</year><month>12</month><volume>19</volume><issue>12</issue><fpage>3051</fpage><lpage>3061</lpage><pub-id pub-id-type="doi">10.1111/jth.15523</pub-id><pub-id pub-id-type="medline">34498381</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alikhan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Combe</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Risk factors for venous thromboembolism in hospitalized patients with acute medical illness: analysis of the MEDENOX Study</article-title><source>Arch Intern Med</source><year>2004</year><month>05</month><day>10</day><volume>164</volume><issue>9</issue><fpage>963</fpage><lpage>968</lpage><pub-id pub-id-type="doi">10.1001/archinte.164.9.963</pub-id><pub-id pub-id-type="medline">15136304</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rahman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Carrier</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Long term risk of 
symptomatic recurrent venous thromboembolism after discontinuation of anticoagulant treatment for first unprovoked venous thromboembolism event: systematic review and meta-analysis</article-title><source>BMJ</source><year>2019</year><month>07</month><day>24</day><volume>366</volume><fpage>l4363</fpage><pub-id pub-id-type="doi">10.1136/bmj.l4363</pub-id><pub-id pub-id-type="medline">31340984</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Albertsen</surname><given-names>IE</given-names> </name><name name-style="western"><surname>Nielsen</surname><given-names>PB</given-names> </name><name name-style="western"><surname>S&#x00F8;gaard</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Risk of recurrent venous thromboembolism: a Danish nationwide cohort study</article-title><source>Am J Med</source><year>2018</year><month>09</month><volume>131</volume><issue>9</issue><fpage>1067</fpage><lpage>1074</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2018.04.042</pub-id><pub-id pub-id-type="medline">30266273</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Campello</surname><given-names>E</given-names> </name><name name-style="western"><surname>Prandoni</surname><given-names>P</given-names> </name></person-group><article-title>Evolving knowledge on primary and secondary prevention of venous thromboembolism in carriers of hereditary thrombophilia: a narrative review</article-title><source>Semin Thromb Hemost</source><year>2022</year><month>11</month><volume>48</volume><issue>8</issue><fpage>937</fpage><lpage>948</lpage><pub-id pub-id-type="doi">10.1055/s-0042-1753527</pub-id><pub-id pub-id-type="medline">36055262</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ageno</surname><given-names>W</given-names> </name><name name-style="western"><surname>Agnelli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Imberti</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Factors associated with the timing of diagnosis of venous thromboembolism: results from the MASTER registry</article-title><source>Thromb Res</source><year>2008</year><volume>121</volume><issue>6</issue><fpage>751</fpage><lpage>756</lpage><pub-id pub-id-type="doi">10.1016/j.thromres.2007.08.009</pub-id><pub-id pub-id-type="medline">17920107</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pellathy</surname><given-names>T</given-names> </name><name name-style="western"><surname>Saul</surname><given-names>M</given-names> </name><name name-style="western"><surname>Clermont</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dubrawski</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Pinsky</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Hravnak</surname><given-names>M</given-names> </name></person-group><article-title>Accuracy of identifying hospital acquired venous thromboembolism by administrative coding: implications for big data and machine learning research</article-title><source>J Clin Monit Comput</source><year>2022</year><month>04</month><volume>36</volume><issue>2</issue><fpage>397</fpage><lpage>405</lpage><pub-id pub-id-type="doi">10.1007/s10877-021-00664-6</pub-id><pub-id pub-id-type="medline">33558981</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>Office of the Surgeon General 
(US)</collab><collab>National Heart, Lung, and Blood Institute (US)</collab></person-group><source>The Surgeon General&#x2019;s Call to Action to Prevent Deep Vein Thrombosis and Pulmonary Embolism</source><year>2008</year><access-date>2022-10-11</access-date><publisher-name>Office of the Surgeon General (US)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK44178/">https://www.ncbi.nlm.nih.gov/books/NBK44178/</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raskob</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Silverstein</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bratzler</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Heit</surname><given-names>JA</given-names> </name><name name-style="western"><surname>White</surname><given-names>RH</given-names> </name></person-group><article-title>Surveillance for deep vein thrombosis and pulmonary embolism: recommendations from a national workshop</article-title><source>Am J Prev Med</source><year>2010</year><month>04</month><volume>38</volume><issue>4 Suppl</issue><fpage>S502</fpage><lpage>S509</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2010.01.010</pub-id><pub-id pub-id-type="medline">20331950</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dalen</surname><given-names>JE</given-names> </name></person-group><article-title>Pulmonary embolism: what have we learned since Virchow?
Natural history, pathophysiology, and diagnosis</article-title><source>Chest</source><year>2002</year><month>10</month><volume>122</volume><issue>4</issue><fpage>1440</fpage><lpage>1456</lpage><pub-id pub-id-type="doi">10.1378/chest.122.4.1440</pub-id><pub-id pub-id-type="medline">12377877</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ozsu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Oztuna</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bulbul</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>The role of risk factors in delayed diagnosis of pulmonary embolism</article-title><source>Am J Emerg Med</source><year>2011</year><month>01</month><volume>29</volume><issue>1</issue><fpage>26</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2009.07.005</pub-id><pub-id pub-id-type="medline">20825770</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burde</surname><given-names>H</given-names> </name></person-group><article-title>Health law: the HITECH Act&#x2014;an overview</article-title><source>Virtual Mentor</source><year>2011</year><month>03</month><day>1</day><volume>13</volume><issue>3</issue><fpage>172</fpage><lpage>175</lpage><pub-id pub-id-type="doi">10.1001/virtualmentor.2011.13.3.hlaw1-1103</pub-id><pub-id pub-id-type="medline">23127320</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Charles</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gabriel</surname><given-names>M</given-names> </name><name
name-style="western"><surname>Searcy</surname><given-names>T</given-names> </name></person-group><article-title>Adoption of electronic health record systems among U.S. non-federal acute care hospitals: 2008-2014</article-title><year>2015</year><access-date>2025-08-06</access-date><fpage>10</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.healthit.gov/sites/default/files/data-brief/2014HospitalAdoptionDataBrief.pdf">https://www.healthit.gov/sites/default/files/data-brief/2014HospitalAdoptionDataBrief.pdf</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Martin-Sanchez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Verspoor</surname><given-names>K</given-names> </name></person-group><article-title>Big data in medicine is driving big changes</article-title><source>Yearb Med Inform</source><year>2014</year><month>08</month><day>15</day><volume>9</volume><issue>1</issue><fpage>14</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.15265/IY-2014-0020</pub-id><pub-id pub-id-type="medline">25123716</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steinkamp</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>TS</given-names> </name></person-group><article-title>Basic artificial intelligence techniques: natural language processing of radiology reports</article-title><source>Radiol Clin North Am</source><year>2021</year><month>11</month><volume>59</volume><issue>6</issue><fpage>919</fpage><lpage>931</lpage><pub-id pub-id-type="doi">10.1016/j.rcl.2021.06.003</pub-id><pub-id pub-id-type="medline">34689877</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Henry</surname><given-names>AS</given-names> </name><etal/></person-group><article-title>Natural language processing to identify cancer treatments with electronic medical records</article-title><source>JCO Clin Cancer Inform</source><year>2021</year><month>04</month><volume>5</volume><fpage>379</fpage><lpage>393</lpage><pub-id pub-id-type="doi">10.1200/CCI.20.00173</pub-id><pub-id pub-id-type="medline">33822653</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Danciu</surname><given-names>I</given-names> </name><name name-style="western"><surname>Alamudun</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Use of natural language processing to extract clinical cancer phenotypes from electronic medical records</article-title><source>Cancer Res</source><year>2019</year><month>11</month><day>1</day><volume>79</volume><issue>21</issue><fpage>5463</fpage><lpage>5470</lpage><pub-id pub-id-type="doi">10.1158/0008-5472.CAN-19-0579</pub-id><pub-id pub-id-type="medline">31395609</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Egleston</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bleicher</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>SJ</given-names> 
</name><name name-style="western"><surname>Lutz</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Vucetic</surname><given-names>S</given-names> </name></person-group><article-title>Statistical inference for natural language processing algorithms with a demonstration using type 2 diabetes prediction from electronic health record notes</article-title><source>Biometrics</source><year>2021</year><month>09</month><volume>77</volume><issue>3</issue><fpage>1089</fpage><lpage>1100</lpage><pub-id pub-id-type="doi">10.1111/biom.13338</pub-id><pub-id pub-id-type="medline">32700317</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riddick</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Choo</surname><given-names>EK</given-names> </name></person-group><article-title>Natural language processing to identify substance misuse in the electronic health record</article-title><source>Lancet Digit Health</source><year>2022</year><month>06</month><volume>4</volume><issue>6</issue><fpage>e401</fpage><lpage>e402</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(22)00096-6</pub-id><pub-id pub-id-type="medline">35623795</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Signor</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Lappe</surname><given-names>KL</given-names> </name><etal/></person-group><article-title>A comparison of natural language processing to ICD-10 codes for identification and characterization of pulmonary embolism</article-title><source>Thromb 
Res</source><year>2021</year><month>07</month><volume>203</volume><fpage>190</fpage><lpage>195</lpage><pub-id pub-id-type="doi">10.1016/j.thromres.2021.04.020</pub-id><pub-id pub-id-type="medline">34044246</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hurdle</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>Natural language processing for the surveillance of postoperative venous thromboembolism</article-title><source>Surgery</source><year>2021</year><month>10</month><volume>170</volume><issue>4</issue><fpage>1175</fpage><lpage>1182</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2021.04.027</pub-id><pub-id pub-id-type="medline">34090671</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kreimeyer</surname><given-names>K</given-names> </name><name name-style="western"><surname>Foster</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Natural language processing systems for capturing and standardizing unstructured clinical information: a systematic review</article-title><source>J Biomed Inform</source><year>2017</year><month>09</month><volume>73</volume><fpage>14</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.07.012</pub-id><pub-id pub-id-type="medline">28729030</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Yu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chakrabortty</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>KP</given-names> </name><etal/></person-group><article-title>Surrogate-assisted feature extraction for high-throughput phenotyping</article-title><source>J Am Med Inform Assoc</source><year>2017</year><month>04</month><day>1</day><volume>24</volume><issue>e1</issue><fpage>e143</fpage><lpage>e149</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocw135</pub-id><pub-id pub-id-type="medline">27632993</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gronsbell</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Enabling phenotypic big data with PheNorm</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>01</month><day>1</day><volume>25</volume><issue>1</issue><fpage>54</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx111</pub-id><pub-id pub-id-type="medline">29126253</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koleck</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Dreisbach</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bourne</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name></person-group><article-title>Natural language processing of symptoms documented in free-text narratives of electronic health 
records: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2019</year><month>04</month><day>1</day><volume>26</volume><issue>4</issue><fpage>364</fpage><lpage>379</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy173</pub-id><pub-id pub-id-type="medline">30726935</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zaharia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name></person-group><article-title>FrugalGPT: how to use large language models while reducing cost and improving performance</article-title><source>arXiv</source><comment>Preprint posted online on May 9, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.05176</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kluegl</surname><given-names>P</given-names> </name><name name-style="western"><surname>Toepfer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beck</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Fette</surname><given-names>G</given-names> </name><name name-style="western"><surname>Puppe</surname><given-names>F</given-names> </name></person-group><article-title>UIMA Ruta: rapid development of rule-based information extraction applications</article-title><source>Nat Lang Eng</source><year>2016</year><month>01</month><volume>22</volume><issue>1</issue><fpage>1</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1017/S1351324914000114</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chapman</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>HP</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name></person-group><article-title>Document-level classification of CT pulmonary angiography reports based on an extension of the ConText algorithm</article-title><source>J Biomed Inform</source><year>2011</year><month>10</month><volume>44</volume><issue>5</issue><fpage>728</fpage><lpage>737</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.03.011</pub-id><pub-id pub-id-type="medline">21459155</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Syrowatka</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pullman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pajares</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Accurately identifying incident cases of venous thromboembolism in the electronic health record: performance of a novel phenotyping algorithm</article-title><source>Thromb Res</source><year>2024</year><month>11</month><volume>243</volume><fpage>109143</fpage><pub-id pub-id-type="doi">10.1016/j.thromres.2024.109143</pub-id><pub-id pub-id-type="medline">39303403</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dykes</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Bowen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Testing 
of an electronic clinical quality measure for diagnostic delay of venous thromboembolism (DOVE) in primary care</article-title><source>AMIA Annu Symp Proc</source><year>2023</year><volume>2023</volume><fpage>339</fpage><lpage>348</lpage><pub-id pub-id-type="medline">38222335</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Plasek</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Mahoney</surname><given-names>LM</given-names> </name><etal/></person-group><article-title>Using medical text extraction, reasoning and mapping system (MTERMS) to process medication information in outpatient clinical notes</article-title><source>AMIA Annu Symp Proc</source><year>2011</year><volume>2011</volume><fpage>1639</fpage><lpage>1648</lpage><pub-id pub-id-type="medline">22195230</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le Glaz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Haralambous</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim-Dufor</surname><given-names>DH</given-names> </name><etal/></person-group><article-title>Machine learning and natural language processing in mental health: systematic review</article-title><source>J Med Internet Res</source><year>2021</year><month>05</month><day>4</day><volume>23</volume><issue>5</issue><fpage>e15708</fpage><pub-id pub-id-type="doi">10.2196/15708</pub-id><pub-id pub-id-type="medline">33944788</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation 
citation-type="web"><article-title>Dove_vtext</article-title><source>GitHub</source><access-date>2025-08-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/jnlaurentiev/dove_vtext">https://github.com/jnlaurentiev/dove_vtext</ext-link></comment></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Boag</surname><given-names>W</given-names> </name></person-group><article-title>Publicly available clinical BERT embeddings</article-title><source>arXiv</source><comment>Preprint posted online on Jun 20, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.03323</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name 
name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><issue>1</issue><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on Jul 13, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> 
</name><etal/></person-group><article-title>Scikit-learn: machine learning in Python</article-title><source>J Mach Learn Res</source><year>2011</year><volume>12</volume><issue>85</issue><fpage>2825</fpage><lpage>2830</lpage><pub-id pub-id-type="doi">10.5555/1953048.2078195</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><year>2016</year><conf-name>22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><publisher-name>ACM</publisher-name><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="web"><source>Tf-idf weighting</source><access-date>2022-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html</ext-link></comment></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="web"><article-title>Diagnostic delay of venous thromboembolism (DOVE) in primary care</article-title><source>Partnership for Quality Measurement</source><access-date>2024-01-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://p4qm.org/measures/3749e">https://p4qm.org/measures/3749e</ext-link></comment></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sabra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mahmood Malik</surname><given-names>K</given-names> </name><name name-style="western"><surname>Alobaidi</surname><given-names>M</given-names> </name></person-group><article-title>Prediction of venous thromboembolism using semantic and sentiment analyses of clinical narratives</article-title><source>Comput Biol Med</source><year>2018</year><month>03</month><day>1</day><volume>94</volume><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2017.12.026</pub-id><pub-id pub-id-type="medline">29353160</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>ZG</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tai</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>YT</given-names> </name></person-group><article-title>Natural language processing in a clinical decision support system for the identification of venous thromboembolism: algorithm development and validation</article-title><source>J Med Internet Res</source><year>2023</year><month>04</month><day>24</day><volume>25</volume><issue>1</issue><fpage>e43153</fpage><pub-id pub-id-type="doi">10.2196/43153</pub-id><pub-id pub-id-type="medline">37093636</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le Guellec</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Lef&#x00E8;vre</surname><given-names>A</given-names> </name><name name-style="western"><surname>Geay</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Performance of an open-source large language model in extracting information from free-text radiology reports</article-title><source>Radiol Artif Intell</source><year>2024</year><month>07</month><volume>6</volume><issue>4</issue><fpage>e230364</fpage><pub-id pub-id-type="doi">10.1148/ryai.230364</pub-id><pub-id pub-id-type="medline">38717292</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Venous thromboembolism symptom lexicon.</p><media xlink:href="medinform_v13i1e63720_app1.docx" xlink:title="DOCX File, 13 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Inclusion criteria <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) and RxNorm codes.</p><media xlink:href="medinform_v13i1e63720_app2.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Symptom prevalence, patient note level.</p><media xlink:href="medinform_v13i1e63720_app3.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Deep learning and machine learning model parameters.</p><media xlink:href="medinform_v13i1e63720_app4.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material></app-group></back></article>