<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><?covid-19-tdm?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">46267</article-id><article-id pub-id-type="doi">10.2196/46267</article-id><title-group><article-title>Comparing Natural Language Processing and Structured Medical Data to Develop a Computable Phenotype for Patients Hospitalized Due to COVID-19: Retrospective Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Chang</surname><given-names>Feier</given-names></name><degrees>BA, MB</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Krishnan</surname><given-names>Jay</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hurst</surname><given-names>Jillian H</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yarrington</surname><given-names>Michael E</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Anderson</surname><given-names>Deverick J</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Brien</surname><given-names>Emily C</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Goldstein</surname><given-names>Benjamin A</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biostatistics and Bioinformatics, Duke University</institution>, <addr-line>Durham</addr-line><addr-line>NC</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>Department of Medicine, Duke University</institution>, <addr-line>Durham</addr-line><addr-line>NC</addr-line>, <country>United States</country></aff><aff id="aff3"><institution>Department of Pediatrics, Duke University</institution>, <addr-line>Durham</addr-line><addr-line>NC</addr-line>, <country>United States</country></aff><aff id="aff4"><institution>Department of Population Health Sciences, Duke University</institution>, <addr-line>Durham</addr-line><addr-line>NC</addr-line>, <country>United States</country></aff><aff id="aff5"><institution>Duke Clinical Research Institute, Duke University</institution>, <addr-line>Durham</addr-line><addr-line>NC</addr-line>, <country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Chatzimina</surname><given-names>Maria Evangelia</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Qingyu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ip</surname><given-names>Wui</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Benjamin A Goldstein, PhD<email>ben.goldstein@duke.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2023</year></pub-date><pub-date pub-type="epub"><day>22</day><month>8</month><year>2023</year></pub-date><volume>11</volume><elocation-id>e46267</elocation-id><history><date date-type="received"><day>07</day><month>02</month><year>2023</year></date><date date-type="rev-recd"><day>19</day><month>05</month><year>2023</year></date><date date-type="accepted"><day>17</day><month>06</month><year>2023</year></date></history><copyright-statement>&#x00A9; Feier Chang, Jay Krishnan, Jillian H Hurst, Michael E Yarrington, Deverick J Anderson, Emily C O'Brien, Benjamin A Goldstein. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 22.8.2023. </copyright-statement><copyright-year>2023</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2023/1/e46267"/><abstract><sec><title>Background</title><p>Throughout the COVID-19 pandemic, many hospitals conducted routine testing of hospitalized patients for SARS-CoV-2 infection upon admission. Some of these patients are admitted for reasons unrelated to COVID-19 and incidentally test positive for the virus. Because COVID-19&#x2013;related hospitalizations have become a critical public health indicator, it is important to identify patients who are hospitalized because of COVID-19 as opposed to those who are admitted for other indications.</p></sec><sec><title>Objective</title><p>We compared the performance of different computable phenotype definitions for COVID-19 hospitalizations that use different types of data from electronic health records (EHRs), including structured EHR data elements, clinical notes, or a combination of both data types.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a retrospective data analysis, using clinician chart review&#x2013;based validation at a large academic medical center. We reviewed and analyzed the charts of 586 hospitalized individuals who tested positive for SARS-CoV-2 in January 2022. We used LASSO (least absolute shrinkage and selection operator) regression and random forests to fit classification algorithms that incorporated structured EHR data elements, clinical notes, or a combination of structured data and clinical notes. We used natural language processing to incorporate data from clinical notes. 
The performance of each model was evaluated based on the area under the receiver operator characteristic curve (AUROC) and an associated decision rule based on sensitivity and positive predictive value. We also identified top words and clinical indicators of COVID-19&#x2013;specific hospitalization and assessed the impact of different phenotyping strategies on estimated hospital outcome metrics.</p></sec><sec sec-type="results"><title>Results</title><p>Based on a chart review, 38.2% (224/586) of patients were determined to have been hospitalized for reasons other than COVID-19, despite having tested positive for SARS-CoV-2. A computable phenotype that used clinical notes had significantly better discrimination than one that used structured EHR data elements (AUROC: 0.894 vs 0.841; <italic>P</italic>&#x003C;.001) and performed similarly to a model that combined clinical notes with structured data elements (AUROC: 0.894 vs 0.893; <italic>P</italic>=.91). Assessments of hospital outcome metrics significantly differed based on whether the population included all hospitalized patients who tested positive for SARS-CoV-2 or those who were determined to have been hospitalized due to COVID-19.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These findings highlight the importance of cause-specific phenotyping for COVID-19 hospitalizations. 
More generally, this work demonstrates the utility of natural language processing approaches for deriving information related to patient hospitalizations in cases where there may be multiple conditions that could serve as the primary indication for hospitalization.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>computable phenotype</kwd><kwd>machine learning</kwd><kwd>COVID</kwd><kwd>coronavirus</kwd><kwd>hospitalize</kwd><kwd>hospitalization</kwd><kwd>electronic health record</kwd><kwd>EHR</kwd><kwd>health record</kwd><kwd>structured data</kwd><kwd>data element</kwd><kwd>free text</kwd><kwd>unstructured data</kwd><kwd>provider note</kwd><kwd>classify</kwd><kwd>classification</kwd><kwd>algorithm</kwd><kwd>COVID-19</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Hospitalization due to COVID-19 has become a key public health indicator. One of the primary goals of vaccination against SARS-CoV-2, the etiological agent of COVID-19, is to reduce the incidence of severe disease and death, with hospitalization serving as a primary end point in vaccine efficacy trials [<xref ref-type="bibr" rid="ref1">1</xref>]. Further, hospitalization has become a primary indicator of community transmission levels of SARS-CoV-2 infection [<xref ref-type="bibr" rid="ref2">2</xref>], including disease severity and health system capacity [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. Similarly, hospitalization due to COVID-19 is a typical outcome of interest in public health studies of COVID-19 using real-world data sources, such as electronic health record (EHR) data [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. 
Finally, because of the rise of rapid, at-home testing for SARS-CoV-2 infection, COVID-19 cases that do not rise to the level of requiring medical attention are likely to be missed or underreported, affecting assessments of COVID-19 prevalence [<xref ref-type="bibr" rid="ref11">11</xref>]. Thus, there is a critical need to rapidly and accurately identify hospitalizations due to COVID-19.</p><p>Due to concerns related to the hospital-based spread of SARS-CoV-2, many institutions routinely perform SARS-CoV-2 testing in patients who are admitted to the hospital, regardless of the primary reason for admission [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Although SARS-CoV-2 testing is important for guiding care and ensuring that health care professionals take precautions to prevent infection, such routine testing potentially complicates retrospective studies using real-world data sources. Specifically, it becomes challenging to distinguish a patient who was admitted because of COVID-19 from a patient who incidentally tested positive for SARS-CoV-2 infection. In both cases, patients would have a positive laboratory test result and would (presumably) have an <italic>International Classification of Diseases, 10th Revision</italic> (<italic>ICD-10</italic>) code for COVID-19. Previous reports have noted that incidental positives may account for around 26% of all COVID-19&#x2013;positive patients [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Given the public health importance of identifying hospitalizations due to COVID-19 rather than hospitalizations in which SARS-CoV-2 infection was identified incidentally, methods (ie, computable phenotypes) are needed to distinguish the two conditions in retrospective data sources. Such phenotypes would be instrumental in retrospective studies of patients with COVID-19 and in public health surveillance. 
In this study, we seek to (1) motivate the need to identify patients who were admitted because of COVID-19 versus patients who incidentally tested positive for SARS-CoV-2 during admission, (2) explore the potential of using both structured data (ie, diagnosis codes, medications, and procedure codes) and unstructured data (ie, clinical notes) to construct computable phenotypes, and (3) illustrate the inferential biases that may arise if phenotyping methods cannot distinguish the reason for hospitalization.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Setting</title><p>We performed a retrospective study of patients aged &#x2265;18 years who were hospitalized with a documented positive SARS-CoV-2 test result during January 2022. We conducted our study at Duke University Health System (DUHS), which consists of 1 quaternary academic medical center and 2 associated community-based hospitals.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was designated as exempt human subjects research by the DUHS Institutional Review Board (IRB number: Pro00109397).</p></sec><sec id="s2-3"><title>Study Data</title><sec id="s2-3-1"><title>Source Data</title><p>Using DUHS EHR data, we identified all patients who were admitted during the week of January 16 to 22, 2022, with documentation of a positive SARS-CoV-2 test result in the prior 20 days. Charts from this week were specifically reviewed in part due to a data request from the North Carolina Division of Public Health to understand the epidemiology of COVID-19&#x2013;related hospitalizations. We excluded individuals with a resolved COVID-19 isolation status, as well as those who were admitted prior to January 1, 2022, to create a cohort of patients who were likely infected with the Omicron variant of SARS-CoV-2. 
During this period, the Omicron variant was the predominant SARS-CoV-2 variant in circulation within the United States and was associated with the largest wave [<xref ref-type="bibr" rid="ref8">8</xref>] of SARS-CoV-2 infections to date. For each patient, we extracted the following data: medical record number, date of admission, hospital unit, and level of care.</p><p>To generate a criterion standard for classification, 6 trained health care professionals manually reviewed patient records for the index admission to adjudicate whether SARS-CoV-2 infection was the primary reason for admission or an incidental finding. Health care professionals attributed hospitalizations as those due to COVID-19 if admissions were due to primary manifestations of SARS-CoV-2 infection, such as hypoxia or the need for supplemental oxygen, or due to COVID-19&#x2013;associated complications, such as dehydration or weakness.</p></sec><sec id="s2-3-2"><title>Analytic Data</title><p>For each admission reviewed, we extracted structured EHR data elements recorded during hospitalization and captured within the Duke Clinical Research Datamart&#x2014;an EHR database that is based on an extension of the PCORnet Common Data Model (National Patient-Centered Clinical Research Network) [<xref ref-type="bibr" rid="ref15">15</xref>]. Clinical notes were extracted from the Duke University Electronic Data Warehouse. We extracted admission data, daily progress data, and discharge summary notes. Extracted structured data elements included demographics, service encounter characteristics, diagnoses, laboratory tests, COVID-19 vaccination status, and medications (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Clinical notes included emergency department admission notes, progress notes, operative notes, history and physical examination notes, and discharge summaries.</p></sec><sec id="s2-3-3"><title>Clinical Note Analysis</title><p>To analyze the clinical notes, we used the term frequency&#x2013;inverse document frequency (TF-IDF) approach. The TF-IDF approach [<xref ref-type="bibr" rid="ref16">16</xref>] generates, across the set of notes for each patient, a numeric value for each word. The word value is based on how common the word is in a patient&#x2019;s set of notes (term frequency), divided by how common the word is across all patients&#x2019; notes (inverse document frequency), resulting in a numeric representation for each word on a per-patient basis. Although this is a simple word-based representation, this approach has the following two advantages over deep learning embedding&#x2013;based approaches: (1) it is possible to directly assess the importance of individual words, and (2) the TF-IDF tends to be more robust with small data sets. Notes were extracted as CSV files and concatenated for the entire encounter. We used the <italic>nltk</italic> package in Python (Python Software Foundation) [<xref ref-type="bibr" rid="ref17">17</xref>] to tokenize words into a dictionary. For each document, we calculated word counts and removed any words that appeared fewer than 50 times. We then generated the corresponding weight matrix, which served as a numeric input for downstream analyses.</p></sec></sec><sec id="s2-4"><title>Analytic Approach</title><p>We first described the clinical characteristics of patients hospitalized due to COVID-19 versus those with incidental COVID-19 by using standardized mean differences (SMDs), with an SMD of &#x003E;0.10 indicating a clinically meaningful difference. 
Next, we developed 3 classification models for COVID-19&#x2013;specific hospitalization; one was based entirely on structured EHR data elements, a second was based on clinical notes alone, and a third used both structured data elements and clinical notes. We used LASSO (least absolute shrinkage and selection operator) [<xref ref-type="bibr" rid="ref18">18</xref>] logistic regression and random forests [<xref ref-type="bibr" rid="ref19">19</xref>] to estimate the models. Due to the relatively small sample size, we presented our results based on 10-fold cross-validation. We performed the TF-IDF approach separately within each cross-validation fold.</p><p>We evaluated the six classification models by calculating the area under the receiver operator characteristic curve (AUROC), along with associated 95% CIs. We identified the top clinical features and words that appeared in clinical notes based on the LASSO and random forest models. We plotted the precision-recall curve to better understand the performance of a classification model and assessed the impact of different rule-based phenotypes.</p><p>As a way to understand the importance and potential impact of accurate phenotyping, we performed an illustrative association analysis, evaluating the relationship between vaccination status and the following hospital outcome metrics: length of stay, intensive care unit (ICU) utilization, and in-hospital mortality. These were chosen, since they are standard quality metrics for operational purposes. We regressed each outcome onto vaccination status. We used a log-linear model for length of stay and used logistic regression for ICU utilization and in-hospital mortality. Each regression was performed by using the full cohort and compared to a model that only included patients who were determined to have been hospitalized due to COVID-19. We also tested for an interaction between vaccination status and the cause of hospitalization. 
We emphasize that these were illustrative analyses, and they were not meant to infer any causal effects of vaccination but rather to illustrate the importance of using cause-specific phenotyping for relevant COVID-19 outcomes.</p><p>All work was performed in R version 4.1.2 (R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref20">20</xref>] and Python version 3.9.1 (Python Software Foundation) [<xref ref-type="bibr" rid="ref21">21</xref>]. The processing code is available in our GitLab (GitLab Inc) [<xref ref-type="bibr" rid="ref22">22</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Patient Characteristics</title><p>In total, we reviewed the charts of 630 patients who were admitted and tested positive for SARS-CoV-2. After excluding patients younger than 18 years and patients with privacy restrictions, our data set included 586 unique patients who were hospitalized and had tested positive for SARS-CoV-2. Of these, 224 (38.2%) were determined, through clinician review, to have been hospitalized for reasons other than COVID-19. During their assessments, our chart reviewers noted that it was often readily apparent which hospitalizations were attributable to COVID-19 and which were not.</p><p>Characteristics, by admission cause, are shown in <xref ref-type="table" rid="table1">Table 1</xref>. Compared with patients hospitalized for indications other than COVID-19, patients hospitalized due to COVID-19 were, on average, older (age: mean 62.7 years vs mean 51.9 years; SMD 0.587), and their admissions were more commonly labeled as emergency admissions (346/362, 95.6% vs 165/224, 73.7%; SMD 0.641). Furthermore, patients hospitalized due to COVID-19 were substantially more likely to receive COVID-19 therapies, including steroids (233/362, 64.4% vs 54/224, 24.1%; SMD 0.887) and the antiviral agent remdesivir (247/362, 68.2% vs 55/224, 24.6%; SMD 0.974), during their hospitalization. 
Patients hospitalized due to COVID-19 had lower lymphocyte counts on average compared with those of patients hospitalized for reasons other than COVID-19. Normal levels of C-reactive protein and the lack of dimerized plasmin fragment D (D-dimer) testing were associated with hospitalizations for reasons other than COVID-19.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Cohort description.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="3">Characteristics</td><td align="left" valign="bottom" colspan="3">Hospitalized due to COVID-19</td><td align="left" valign="bottom">Standardized mean difference</td></tr><tr><td align="left" valign="bottom" colspan="3"/><td align="left" valign="top">No (n=224)</td><td align="left" valign="top">Yes (n=362)</td><td align="left" valign="top">Total (N=586)</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Sex (female), n (%)</td><td align="left" valign="top">120 (53.6)</td><td align="left" valign="top">181 (50)</td><td align="left" valign="top">301 (51.4)</td><td align="left" valign="top">0.072</td></tr><tr><td align="left" valign="top" colspan="3">Age (years), mean</td><td align="left" valign="top">51.9</td><td align="left" valign="top">62.7</td><td align="left" valign="top">58.6</td><td align="left" valign="top">0.587</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Patient outcome at discharge, n (%)</bold></td><td align="left" valign="top">0.169</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Dead</td><td align="left" valign="top">18 (8)</td><td align="left" valign="top">39 (10.8)</td><td align="left" valign="top">57 (9.7)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Home</td><td align="left" valign="top">176 (78.6)</td><td align="left" valign="top">258 
(71.3)</td><td align="left" valign="top">434 (74.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Other facility</td><td align="left" valign="top">30 (13.4)</td><td align="left" valign="top">65 (18)</td><td align="left" valign="top">95 (16.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Admission type, n (%)</bold></td><td align="left" valign="top">0.641</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Emergency admission</td><td align="left" valign="top">165 (73.7)</td><td align="left" valign="top">346 (95.6)</td><td align="left" valign="top">511 (87.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Routine elective admission</td><td align="left" valign="top">24 (10.7)</td><td align="left" valign="top">4 (1.1)</td><td align="left" valign="top">28 (4.8)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Urgent admission</td><td align="left" valign="top">35 (15.6)</td><td align="left" valign="top">12 (3.3)</td><td align="left" valign="top">47 (8)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="3">Transfer to intensive care unit, n (%)</td><td align="left" valign="top">45 (20.1)</td><td align="left" valign="top">78 (21.5)</td><td align="left" valign="top">123 (21)</td><td align="left" valign="top">0.036</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Encounter type, n (%)</bold></td><td align="left" valign="top">0.181</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Emergency</td><td align="left" valign="top">2 (0.9)</td><td align="left" valign="top">1 (0.3)</td><td align="left" valign="top">3 (0.5)</td><td align="left" 
valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Emergency to inpatient</td><td align="left" valign="top">180 (80.4)</td><td align="left" valign="top">314 (86.7)</td><td align="left" valign="top">494 (84.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Inpatient</td><td align="left" valign="top">31 (13.8)</td><td align="left" valign="top">35 (9.7)</td><td align="left" valign="top">66 (11.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Observation stay</td><td align="left" valign="top">11 (4.9)</td><td align="left" valign="top">12 (3.3)</td><td align="left" valign="top">23 (3.9)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Race and ethnicity, n (%)</bold></td><td align="left" valign="top">0.168</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Hispanic</td><td align="left" valign="top">21 (9.4)</td><td align="left" valign="top">20 (5.5)</td><td align="left" valign="top">41 (7)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Non-Hispanic Black</td><td align="left" valign="top">106 (47.3)</td><td align="left" valign="top">175 (48.3)</td><td align="left" valign="top">281 (48)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Non-Hispanic White</td><td align="left" valign="top">90 (40.2)</td><td align="left" valign="top">152 (42)</td><td align="left" valign="top">242 (41.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Non-Hispanic Asian</td><td align="left" valign="top">7 (3.1)</td><td align="left" valign="top">14 
(3.9)</td><td align="left" valign="top">21 (3.6)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Other races</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (0.3)</td><td align="left" valign="top">1 (0.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="3">Length of stay (days), mean</td><td align="left" valign="top">10.2</td><td align="left" valign="top">9.9</td><td align="left" valign="top">10</td><td align="left" valign="top">0.026</td></tr><tr><td align="left" valign="top" colspan="6"><bold>BMI, n (%)</bold></td><td align="left" valign="top">0.203</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Missing</td><td align="left" valign="top">9 (4)</td><td align="left" valign="top">9 (2.5)</td><td align="left" valign="top">18 (3.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Normal</td><td align="left" valign="top">65 (29)</td><td align="left" valign="top">89 (24.6)</td><td align="left" valign="top">154 (26.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Obese</td><td align="left" valign="top">85 (37.9)</td><td align="left" valign="top">147 (40.6)</td><td align="left" valign="top">232 (39.6)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Overweight</td><td align="left" valign="top">60 (26.8)</td><td align="left" valign="top">98 (27.1)</td><td align="left" valign="top">158 (27)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Underweight</td><td align="left" valign="top">5 (2.2)</td><td align="left" valign="top">19 (5.2)</td><td align="left" 
valign="top">24 (4.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Raw payer type value, n (%)</bold></td><td align="left" valign="top">0.305</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Private</td><td align="left" valign="top">102 (45.5)</td><td align="left" valign="top">180 (49.7)</td><td align="left" valign="top">282 (48.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Public</td><td align="left" valign="top">88 (39.3)</td><td align="left" valign="top">144 (39.8)</td><td align="left" valign="top">232 (39.6)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Self-pay</td><td align="left" valign="top">21 (9.4)</td><td align="left" valign="top">9 (2.5)</td><td align="left" valign="top">30 (5.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Other</td><td align="left" valign="top">13 (5.8)</td><td align="left" valign="top">29 (8)</td><td align="left" valign="top">42 (7.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="3">Vaccinated against COVID-19, n (%)</td><td align="left" valign="top">113 (50.4)</td><td align="left" valign="top">178 (49.2)</td><td align="left" valign="top">291 (49.7)</td><td align="left" valign="top">0.026</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Comorbidities, n (%)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Surgery</td><td align="left" valign="top">200 (89.3)</td><td align="left" valign="top">302 (83.4)</td><td align="left" valign="top">502 (85.7)</td><td align="left" valign="top">0.171</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Cancer</td><td 
align="left" valign="top">29 (12.9)</td><td align="left" valign="top">45 (12.4)</td><td align="left" valign="top">74 (12.6)</td><td align="left" valign="top">0.015</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Cardiovascular</td><td align="left" valign="top">75 (33.5)</td><td align="left" valign="top">146 (40.3)</td><td align="left" valign="top">221 (37.7)</td><td align="left" valign="top">0.142</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Hypertension</td><td align="left" valign="top">73 (32.6)</td><td align="left" valign="top">151 (41.7)</td><td align="left" valign="top">224 (38.2)</td><td align="left" valign="top">0.19</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Chronic liver disease</td><td align="left" valign="top">30 (13.4)</td><td align="left" valign="top">46 (12.7)</td><td align="left" valign="top">76 (13)</td><td align="left" valign="top">0.02</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Chronic obstructive pulmonary disease</td><td align="left" valign="top">21 (9.4)</td><td align="left" valign="top">50 (13.8)</td><td align="left" valign="top">71 (12.1)</td><td align="left" valign="top">0.139</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Asthma</td><td align="left" valign="top">18 (8)</td><td align="left" valign="top">39 (10.8)</td><td align="left" valign="top">57 (9.7)</td><td align="left" valign="top">0.094</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Chronic renal disease</td><td align="left" valign="top">44 (19.6)</td><td align="left" valign="top">111 (30.7)</td><td align="left" valign="top">155 (26.5)</td><td align="left" valign="top">0.256</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Diabetes</td><td align="left" valign="top">45 (20.1)</td><td align="left" 
valign="top">103 (28.5)</td><td align="left" valign="top">148 (25.3)</td><td align="left" valign="top">0.196</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Medications, n (%)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Bronchodilator</td><td align="left" valign="top">44 (19.6)</td><td align="left" valign="top">149 (41.2)</td><td align="left" valign="top">193 (32.9)</td><td align="left" valign="top">0.481</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Steroid</td><td align="left" valign="top">54 (24.1)</td><td align="left" valign="top">233 (64.4)</td><td align="left" valign="top">287 (49)</td><td align="left" valign="top">0.887</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Anticoagulant antiplatelet</td><td align="left" valign="top">121 (54)</td><td align="left" valign="top">284 (78.5)</td><td align="left" valign="top">405 (69.1)</td><td align="left" valign="top">0.535</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Diuretic</td><td align="left" valign="top">60 (26.8)</td><td align="left" valign="top">131 (36.2)</td><td align="left" valign="top">191 (32.6)</td><td align="left" valign="top">0.203</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Cough suppressant</td><td align="left" valign="top">44 (19.6)</td><td align="left" valign="top">162 (44.8)</td><td align="left" valign="top">206 (35.2)</td><td align="left" valign="top">0.558</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Paralytic</td><td align="left" valign="top">10 (4.5)</td><td align="left" valign="top">30 (8.3)</td><td align="left" valign="top">40 (6.8)</td><td align="left" valign="top">0.157</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Expectorant</td><td align="left" valign="top">14 (6.3)</td><td align="left" 
valign="top">56 (15.5)</td><td align="left" valign="top">70 (11.9)</td><td align="left" valign="top">0.3</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Remdesivir</td><td align="left" valign="top">55 (24.6)</td><td align="left" valign="top">247 (68.2)</td><td align="left" valign="top">302 (51.5)</td><td align="left" valign="top">0.974</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">Inhaled steroid</td><td align="left" valign="top">24 (10.7)</td><td align="left" valign="top">42 (11.6)</td><td align="left" valign="top">66 (11.3)</td><td align="left" valign="top">0.028</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Laboratory tests, n (%)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>Absolute lymphocyte count</bold></td><td align="left" valign="top">0.345</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td align="left" valign="top">1 (0.4)</td><td align="left" valign="top">2 (0.6)</td><td align="left" valign="top">3 (0.5)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Low</td><td align="left" valign="top">12 (5.4)</td><td align="left" valign="top">47 (13)</td><td align="left" valign="top">59 (10.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">23 (10.3)</td><td align="left" valign="top">59 (16.3)</td><td align="left" valign="top">82 (14)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not taken</td><td align="left" valign="top">188 (83.9)</td><td align="left" valign="top">254 (70.2)</td><td align="left" valign="top">442 
(75.4)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>Lymphocyte count</bold></td><td align="left" valign="top">0.528</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Low</td><td align="left" valign="top">17 (7.6)</td><td align="left" valign="top">71 (19.6)</td><td align="left" valign="top">88 (15)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">131 (58.5)</td><td align="left" valign="top">233 (64.4)</td><td align="left" valign="top">364 (62.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not taken</td><td align="left" valign="top">76 (33.9)</td><td align="left" valign="top">56 (15.5)</td><td align="left" valign="top">132 (22.5)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">2 (0.6)</td><td align="left" valign="top">2 (0.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>C-reactive protein</bold></td><td align="left" valign="top">0.602</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td align="left" valign="top">62 (27.7)</td><td align="left" valign="top">203 (56.1)</td><td align="left" valign="top">265 (45.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">11 (4.9)</td><td align="left" valign="top">9 (2.5)</td><td 
align="left" valign="top">20 (3.4)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not taken</td><td align="left" valign="top">151 (67.4)</td><td align="left" valign="top">150 (41.4)</td><td align="left" valign="top">301 (51.4)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>Ferritin</bold></td><td align="left" valign="top">0.361</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td align="left" valign="top">39 (17.4)</td><td align="left" valign="top">107 (29.6)</td><td align="left" valign="top">146 (24.9)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Low</td><td align="left" valign="top">2 (0.9)</td><td align="left" valign="top">3 (0.8)</td><td align="left" valign="top">5 (0.9)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">17 (7.6)</td><td align="left" valign="top">44 (12.2)</td><td align="left" valign="top">61 (10.4)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not taken</td><td align="left" valign="top">166 (74.1)</td><td align="left" valign="top">208 (57.5)</td><td align="left" valign="top">374 (63.8)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>D-dimer<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></bold></td><td align="left" valign="top">1.187</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td 
align="left" valign="top">19 (8.5)</td><td align="left" valign="top">117 (32.3)</td><td align="left" valign="top">136 (23.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">36 (16.1)</td><td align="left" valign="top">156 (43.1)</td><td align="left" valign="top">192 (32.8)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not taken</td><td align="left" valign="top">169 (75.4)</td><td align="left" valign="top">89 (24.6)</td><td align="left" valign="top">258 (44)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="5"><bold>Procalcitonin</bold></td><td align="left" valign="top">0.524</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">High</td><td align="left" valign="top">4 (1.8)</td><td align="left" valign="top">22 (6.1)</td><td align="left" valign="top">26 (4.4)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Missing</td><td align="left" valign="top">208 (92.9)</td><td align="left" valign="top">268 (74)</td><td align="left" valign="top">476 (81.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">12 (5.4)</td><td align="left" valign="top">72 (19.9)</td><td align="left" valign="top">84 (14.3)</td><td align="left" valign="top">&#x2003;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>D-dimer: dimerized plasmin fragment D.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Performance of Classification Models</title><p>After 
tokenizing words and removing terms with fewer than 50 occurrences, our models included 7953 unique terms. There was minimal difference between the LASSO and random forest models. The random forest model based solely on clinical notes, the one based solely on structured data elements, and the one that used both clinical notes and structured data elements had AUROCs of 0.882 (95% CI 0.85-0.909), 0.829 (95% CI 0.794-0.864), and 0.890 (95% CI 0.864-0.916), respectively. The LASSO model based solely on clinical notes (AUROC=0.894, 95% CI 0.868-0.920) had better discrimination than the LASSO model based solely on structured data elements (AUROC=0.841, 95% CI 0.809-0.874; <italic>P</italic>&#x003C;.001). The LASSO model using both clinical notes and structured data elements (AUROC=0.893, 95% CI 0.868-0.919) had similar discrimination to that of the LASSO model based solely on clinical notes (<italic>P</italic>=.91).</p><p>Next, we examined the top structured data elements and terms in each model (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Highly predictive data elements and words corresponded to patient characteristics with large SMDs (<xref ref-type="table" rid="table1">Table 1</xref>). Words that are reflective of hospitalization due to COVID-19 have positive coefficients, while words reflective of hospitalization for other reasons have negative coefficients. Terms reflective of COVID-19&#x2013;specific hospitalization were related to the care of patients with COVID-19, such as &#x201C;remdesivir&#x201D; and &#x201C;dexamethason.&#x201D; Other structured elements related to the likelihood of being hospitalized for COVID-19 included receipt of steroids, low lymphocyte counts, and underweight BMIs. Terms reflective of hospitalizations due to indications other than COVID-19 included strings that may be related to surgical procedures (eg, &#x201C;surgic&#x201D; for &#x201C;surgical&#x201D; or &#x201C;dress&#x201D; for &#x201C;dressing&#x201D;). 
For structured data elements, a lack of D-dimer collection and low ferritin levels were most commonly associated with admissions for reasons other than COVID-19. Similar features were identified from the random forest model (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The top regression coefficients from the LASSO models, as reflective of variable importance for (A) the model using just structured data elements and (B) the model using just clinical notes. Values greater than 0 indicate that the feature has a positive association with hospitalization due to COVID-19, while values less than 0 indicate that a feature has a negative association.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v11i1e46267_fig01.png"/></fig></sec><sec id="s3-3"><title>Impact of Correct Classification</title><p>In order to assess the performance of a computable phenotype&#x2013;based decision rule, we examined the precision-recall curve of the different models (<xref ref-type="fig" rid="figure2">Figure 2</xref>). For example, a rule that maintains a sensitivity of 90% (ie, one that would capture 90% of all patients hospitalized due to COVID-19) resulted in positive predictive values of 76%, 82%, and 84% and corresponding <italic>F</italic><sub>1</sub>-scores of 0.824, 0.858, and 0.869 based on structured data elements, clinical notes, and their combination, respectively. To illustrate the impact of these differences, we considered the impact of implementing each of these phenotypes at a 90% sensitivity to classify patients during the January Omicron wave. Within our health system, 1378 people were hospitalized and tested positive for SARS-CoV-2. 
Based on our analyses, using the LASSO-based phenotype that incorporates structured data, clinical notes, or their combination would result in approximately 244, 165, and 142 false positives, respectively.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Precision-recall (positive predictive value and sensitivity) curve for the different classification algorithms. This illustrates the trade-off between the identification of patients hospitalized due to COVID-19 (x-axis: sensitivity) and the accuracy of that capture (y-axis: positive predictive value). There is minimal difference between using just notes or notes with structured data elements. The model with only structured data elements performs notably worse in terms of positive predictive value at the same sensitivity thresholds. AUPRC: area under the precision-recall curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v11i1e46267_fig02.png"/></fig><p>We next sought to evaluate the potential impact of different phenotyping methods on hospital outcome metrics, comparing a method that incorporates the reason for hospitalization versus one that does not. We used a regression analysis to assess the marginal relationship. As a use case, we evaluated associations between vaccine status and the following three hospital outcome metrics: length of stay, risk of ICU utilization, and in-hospital mortality. These evaluations were performed with the following three cohorts: all hospitalized patients, those who were determined to have been hospitalized due to COVID-19, and those who tested positive for SARS-CoV-2 but were hospitalized for unrelated reasons (<xref ref-type="table" rid="table2">Table 2</xref>). For length of stay, the magnitude of the effect of vaccine status changed based on the cohort used. 
In the cohort of all hospitalized patients, vaccinated patients had a shorter length of stay (relative rate 0.81, 95% CI 0.71-0.93). However, when limiting the analytic cohort to patients hospitalized due to COVID-19, there was no significant difference in length of stay for vaccinated patients versus unvaccinated patients (relative rate 0.98, 95% CI 0.83-1.16; <italic>P</italic> value for interaction&#x003C;.001). We found similar patterns in analyses of other in-hospital outcomes; vaccination was associated with reduced risks of ICU utilization and in-hospital mortality among patients hospitalized for reasons other than COVID-19 when compared to those among patients hospitalized due to COVID-19. Effects were robust to adjustment for age (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). These results illustrate the impact of selecting the correct cohort for analysis and the potential ramifications of using a cohort in which the reason for hospitalization has not been determined.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Marginal association between vaccine status<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> and outcome metrics, unadjusted for age.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Outcome</td><td align="left" valign="top">Full cohort</td><td align="left" valign="top">Hospitalized due to COVID-19</td><td align="left" valign="top">Hospitalization unrelated to COVID-19</td><td align="left" valign="top"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Length of stay, relative rate (95% CI)</td><td align="left" valign="top">0.81 (0.71-0.93)</td><td align="left" valign="top">0.98 (0.83-1.16)</td><td align="left" valign="top">0.59 (0.47-0.74)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" 
valign="top">ICU<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> utilization, odds ratio (95% CI)</td><td align="left" valign="top">1.04 (0.70-1.56)</td><td align="left" valign="top">1.25 (0.75-2.07)</td><td align="left" valign="top">0.77 (0.40-1.49)</td><td align="left" valign="top">.26</td></tr><tr><td align="left" valign="top">Mortality, odds ratio (95% CI)</td><td align="left" valign="top">1.02 (0.59-1.78)</td><td align="left" valign="top">1.45 (0.74-2.88)</td><td align="left" valign="top">0.48 (0.16-1.29)</td><td align="left" valign="top">.08</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Unvaccinated patients are the reference group.</p></fn><fn id="table2fn2"><p><sup>b</sup><italic>P</italic> value is for hospitalization due to COVID-19 versus hospitalization unrelated to COVID-19.</p></fn><fn id="table2fn3"><p><sup>c</sup>ICU: intensive care unit.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Due to the public health importance of the accurate identification of COVID-19&#x2013;related hospitalizations, there is a need for methods and computable phenotypes to identify hospital admissions in which the primary cause is COVID-19 [<xref ref-type="bibr" rid="ref23">23</xref>]. We used machine learning methods and a physician chart review to develop a classification algorithm for hospitalization due to COVID-19. We found that 38.2% (224/586) of patients who were hospitalized at our institution during the Omicron wave and tested positive for SARS-CoV-2 infection were hospitalized for reasons other than COVID-19. These findings are in line with other recent studies, which found that an average of 26% of hospitalized patients with a positive SARS-CoV-2 test result had a primary indication for hospitalization that was unrelated to COVID-19 [<xref ref-type="bibr" rid="ref14">14</xref>]. 
We found that a model based on clinical notes performed better than one based solely on structured EHR data elements. This work has important implications for retrospective analyses using EHR data to assess outcomes related to COVID-19, including vaccine effectiveness and health system capacity [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>Prior work by Lynch and colleagues [<xref ref-type="bibr" rid="ref25">25</xref>] evaluated the utility of <italic>ICD-10</italic> codes for COVID-19 diagnosis in inpatient, outpatient, emergency care, and urgent care settings during time periods across the pandemic; using a weighted, random sample of 1500 records from the Department of Veterans Affairs, they found that the COVID-19 <italic>ICD-10</italic> code (U07.1) had a relatively low positive predictive value across settings and time periods. These findings highlight the need for additional contextual data to identify acute cases of COVID-19. The Consortium for Clinical Characterization of COVID-19 by EHR (4CE) conducted a similar study of EHR data from 12 clinical sites to identify combinations of structured data elements to generate a reliable computable phenotype for hospitalization due to COVID-19, with a reported AUROC of 0.903 [<xref ref-type="bibr" rid="ref26">26</xref>]. Similarly, we derived an AUROC of 0.841 based solely on structured data elements; however, we also found that the inclusion of clinical notes significantly improved the performance of the classification model (AUROC=0.893; <italic>P</italic>&#x003C;.001). This result is not surprising, as the clinical narrative often includes important nuance, and as our chart reviewers noted, it was often readily apparent which hospitalizations were attributable to COVID-19 and which were not. 
Of note, chart reviewers in our study classified hospitalizations that were indirectly due to SARS-CoV-2 infection, such as those due to COVID-19&#x2013;related weakness or delirium, as hospitalizations due to COVID-19, which could partly explain the observed difference in discriminatory ability between our study and the study conducted by the 4CE.</p><p>By using the TF-IDF approach in conjunction with LASSO regression, we identified both individual terms and the direction of the association between each term and the hospitalization indication. Although the TF-IDF approach is a simple natural language processing (NLP) approach, it is also very scalable, interpretable, and implementable. Our results highlight the power of even simple natural language models. The terms that best predicted hospitalizations due to COVID-19 included common descriptors that were used in the clinical care of patients with COVID-19, such as &#x201C;hypox&#x201D; (likely shortened from &#x201C;hypoxia&#x201D; or &#x201C;hypoxic&#x201D;), or COVID-19 therapies like remdesivir. Conversely, the terms that were not associated with hospitalizations due to COVID-19 included words related to surgery&#x2014;a common indication for hospital admission that is generally unrelated to SARS-CoV-2 infection.</p><p>To help contextualize our results, we also assessed the real-world impact of using an accurate phenotype for COVID-19&#x2013;specific hospitalization. In studying hospitalized patients with COVID-19, the simplest analysis would be to include all patients with a COVID-19&#x2013;positive test result. As our illustrative analysis showed, when using this full but heterogeneous cohort, the results suggested that vaccination status is associated with a shorter length of stay. 
However, when we limited the analysis to only include patients who were identified as having been hospitalized due to COVID-19 (ie, people with symptoms of COVID-19), the analysis indicated that vaccines are not associated with a shorter length of stay. We interpreted these data as indicating that, conditional on someone being sick enough to be hospitalized due to COVID-19, vaccines provide no additional benefit in terms of the length of hospitalization. Similar patterns were found for other hospital outcome metrics. Although this analysis was not intended to be a causal analysis, it did illustrate how the use of accurately classified cohorts is important for the calculation of standard outcome metrics and likely impacts other related association analyses.</p><p>More broadly, this work highlights the importance and challenge of phenotyping cause-specific events. Although there is rich literature on computable phenotypes, most of this literature is geared toward the identification of chronic diseases (eg, presence of asthma). However, few computable phenotypes have focused on cause-specific events (eg, asthma exacerbation). Such cause-specific phenotypes often exhibit poor specificity and can require algorithms that are more complex than those required for chronic conditions. As this work shows, and as suggested by others, NLP-based phenotyping approaches are becoming more common, and further comparisons between NLP approaches and other methods will be needed to determine whether using text data can improve cause-specific phenotypes.</p><p>Although our study used rigorous methods, there are some key limitations. First and most notably, our findings are primarily illustrative and may not represent a generalizable algorithm for phenotyping COVID-19&#x2013;specific hospitalizations. This study was conducted across a single hospital system, and it may not be reflective of practices at other institutions. 
Importantly, we would not expect our specific phenotype algorithm to be generalizable to other institutions. Second, we only looked at 1 period of time, namely the January 2022 Omicron wave; however, there are documented differences in the rate of hospitalization and positive test results over the course of the pandemic, and our models may not accurately reflect distinguishing factors in other waves. Third, given the time constraints of chart reviews, we were only able to analyze a relatively small sample. In particular, the small sample size limited our ability to apply more sophisticated NLP-based approaches, such as the use of n-grams.</p></sec><sec id="s4-2"><title>Conclusions</title><p>Overall, our results show that a sizable number of people who were hospitalized and tested positive for SARS-CoV-2 were hospitalized for reasons other than COVID-19. The conflation of these individuals can impact our understanding of hospital outcome metrics. We constructed a strong classification model that can be used as a computable phenotype to distinguish patients who were hospitalized due to COVID-19 from those who incidentally tested positive for SARS-CoV-2 but were hospitalized for other reasons. Moreover, we found that while structured data elements are useful in constructing such a phenotype, clinical notes had a higher positive predictive value than that of structured data elements alone. Future work should seek to explore the generalizability of such phenotypes across institutions and different waves of the COVID-19 pandemic.</p></sec></sec></body><back><ack><p>This work was supported by Food and Drug Administration Broad Agency Announcement (FDA BAA) 75F40121C00158 (principal investigator: BAG). 
We thank Mike Chrestensen for support in extracting the clinical notes for this study.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">4CE</term><def><p>Consortium for Clinical Characterization of COVID-19 by EHR</p></def></def-item><def-item><term id="abb2">AUROC</term><def><p>area under the receiver operator characteristic curve</p></def></def-item><def-item><term id="abb3">D-dimer</term><def><p>dimerized plasmin fragment D</p></def></def-item><def-item><term id="abb4">DUHS</term><def><p>Duke University Health System</p></def></def-item><def-item><term id="abb5">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb6"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, 10th Revision</italic></p></def></def-item><def-item><term id="abb7">ICU</term><def><p>intensive care unit</p></def></def-item><def-item><term id="abb8">LASSO</term><def><p>least absolute shrinkage and selection operator</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">SMD</term><def><p>standardized mean difference</p></def></def-item><def-item><term id="abb11">TF-IDF</term><def><p>term frequency&#x2013;inverse document frequency</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrotra</surname><given-names>DV</given-names></name><name name-style="western"><surname>Janes</surname><given-names>HE</given-names></name><name name-style="western"><surname>Fleming</surname><given-names>TR</given-names></name><name name-style="western"><surname>Annunziato</surname><given-names>PW</given-names></name><name 
name-style="western"><surname>Neuzil</surname><given-names>KM</given-names></name><name name-style="western"><surname>Carpp</surname><given-names>LN</given-names></name><etal/></person-group><article-title>Clinical endpoints for evaluating efficacy in COVID-19 vaccine trials</article-title><source>Ann Intern Med</source><year>2021</year><month>02</month><volume>174</volume><issue>2</issue><fpage>221</fpage><lpage>228</lpage><pub-id pub-id-type="doi">10.7326/M20-6169</pub-id><pub-id pub-id-type="medline">33090877</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Centers for Disease Control and Prevention</collab></person-group><article-title>Science brief: indicators for monitoring COVID-19 community levels and making public health recommendations</article-title><access-date>2023-01-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/coronavirus/2019-ncov/science/science-briefs/indicators-monitoring-community-levels.html">www.cdc.gov/coronavirus/2019-ncov/science/science-briefs/indicators-monitoring-community-levels.html</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fillmore</surname><given-names>NR</given-names></name><name name-style="western"><surname>La</surname><given-names>J</given-names></name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names></name><name name-style="western"><surname>Doron</surname><given-names>S</given-names></name><name name-style="western"><surname>Do</surname><given-names>NV</given-names></name><name name-style="western"><surname>Monach</surname><given-names>PA</given-names></name><etal/></person-group><article-title>The COVID-19 hospitalization metric in the pre- and postvaccination eras as a measure of pandemic severity: a retrospective, 
nationwide cohort study</article-title><source>Infect Control Hosp Epidemiol</source><year>2022</year><month>12</month><volume>43</volume><issue>12</issue><fpage>1767</fpage><lpage>1772</lpage><pub-id pub-id-type="doi">10.1017/ice.2022.13</pub-id><pub-id pub-id-type="medline">35012694</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le&#x00F3;n</surname><given-names>TM</given-names></name><name name-style="western"><surname>Dorabawila</surname><given-names>V</given-names></name><name name-style="western"><surname>Nelson</surname><given-names>L</given-names></name><name name-style="western"><surname>Lutterloh</surname><given-names>E</given-names></name><name name-style="western"><surname>Bauer</surname><given-names>UE</given-names></name><name name-style="western"><surname>Backenson</surname><given-names>B</given-names></name><etal/></person-group><article-title>COVID-19 cases and hospitalizations by COVID-19 vaccination status and previous COVID-19 diagnosis - California and New York, May-November 2021</article-title><source>MMWR Morb Mortal Wkly Rep</source><year>2022</year><month>01</month><day>28</day><volume>71</volume><issue>4</issue><fpage>125</fpage><lpage>131</lpage><pub-id pub-id-type="doi">10.15585/mmwr.mm7104e1</pub-id><pub-id pub-id-type="medline">35085222</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fall</surname><given-names>A</given-names></name><name name-style="western"><surname>Eldesouki</surname><given-names>RE</given-names></name><name name-style="western"><surname>Sachithanandham</surname><given-names>J</given-names></name><name name-style="western"><surname>Morris</surname><given-names>CP</given-names></name><name 
name-style="western"><surname>Norton</surname><given-names>JM</given-names></name><name name-style="western"><surname>Gaston</surname><given-names>DC</given-names></name><etal/></person-group><article-title>A quick displacement of the SARS-CoV-2 variant Delta with Omicron: unprecedented spike in COVID-19 cases associated with fewer admissions and comparable upper respiratory viral loads</article-title><source>medRxiv. Preprint posted online on January 28, 2022</source><pub-id pub-id-type="doi">10.1101/2022.01.26.22269927</pub-id><pub-id pub-id-type="medline">35118480</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stowe</surname><given-names>J</given-names></name><name name-style="western"><surname>Andrews</surname><given-names>N</given-names></name><name name-style="western"><surname>Kirsebom</surname><given-names>F</given-names></name><name name-style="western"><surname>Ramsay</surname><given-names>M</given-names></name><name name-style="western"><surname>Bernal</surname><given-names>JL</given-names></name></person-group><article-title>Effectiveness of COVID-19 vaccines against Omicron and Delta hospitalisation, a test negative case-control study</article-title><source>Nat Commun</source><year>2022</year><month>09</month><day>30</day><volume>13</volume><issue>1</issue><fpage>5736</fpage><pub-id pub-id-type="doi">10.1038/s41467-022-33378-7</pub-id><pub-id pub-id-type="medline">36180428</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Havers</surname><given-names>FP</given-names></name><name name-style="western"><surname>Patel</surname><given-names>K</given-names></name><name name-style="western"><surname>Whitaker</surname><given-names>M</given-names></name><name 
name-style="western"><surname>Milucky</surname><given-names>J</given-names></name><name name-style="western"><surname>Reingold</surname><given-names>A</given-names></name><name name-style="western"><surname>Armistead</surname><given-names>I</given-names></name><etal/></person-group><article-title>Laboratory-confirmed COVID-19-associated hospitalizations among adults during SARS-CoV-2 Omicron BA.2 variant predominance - COVID-19-Associated Hospitalization Surveillance Network, 14 States, June 20, 2021-May 31, 2022</article-title><source>MMWR Morb Mortal Wkly Rep</source><year>2022</year><month>08</month><day>26</day><volume>71</volume><issue>34</issue><fpage>1085</fpage><lpage>1091</lpage><pub-id pub-id-type="doi">10.15585/mmwr.mm7134a3</pub-id><pub-id pub-id-type="medline">36006841</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taylor</surname><given-names>CA</given-names></name><name name-style="western"><surname>Whitaker</surname><given-names>M</given-names></name><name name-style="western"><surname>Anglin</surname><given-names>O</given-names></name><name name-style="western"><surname>Milucky</surname><given-names>J</given-names></name><name name-style="western"><surname>Patel</surname><given-names>K</given-names></name><name name-style="western"><surname>Pham</surname><given-names>H</given-names></name><etal/></person-group><article-title>COVID-19-associated hospitalizations among adults during SARS-CoV-2 Delta and Omicron variant predominance, by race/ethnicity and vaccination status - COVID-NET, 14 States, July 2021-January 2022</article-title><source>MMWR Morb Mortal Wkly Rep</source><year>2022</year><month>03</month><day>25</day><volume>71</volume><issue>12</issue><fpage>466</fpage><lpage>473</lpage><pub-id pub-id-type="doi">10.15585/mmwr.mm7112e2</pub-id><pub-id pub-id-type="medline">35324880</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hilal</surname><given-names>W</given-names></name><name name-style="western"><surname>Chislett</surname><given-names>MG</given-names></name><name name-style="western"><surname>Snider</surname><given-names>B</given-names></name><name name-style="western"><surname>McBean</surname><given-names>EA</given-names></name><name name-style="western"><surname>Yawney</surname><given-names>J</given-names></name><name name-style="western"><surname>Gadsden</surname><given-names>SA</given-names></name></person-group><article-title>Use of AI to assess COVID-19 variant impacts on hospitalization, ICU, and death</article-title><source>Front Artif Intell</source><year>2022</year><month>11</month><day>30</day><volume>5</volume><fpage>927203</fpage><pub-id pub-id-type="doi">10.3389/frai.2022.927203</pub-id><pub-id pub-id-type="medline">36530359</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Jes&#x00FA;s Ascencio-Montiel</surname><given-names>I</given-names></name><name name-style="western"><surname>Ovalle-Luna</surname><given-names>OD</given-names></name><name name-style="western"><surname>Rasc&#x00F3;n-Pacheco</surname><given-names>RA</given-names></name><name name-style="western"><surname>Borja-Aburto</surname><given-names>VH</given-names></name><name name-style="western"><surname>Chowell</surname><given-names>G</given-names></name></person-group><article-title>Comparative epidemiology of five waves of COVID-19 in Mexico, March 2020-August 2022</article-title><source>BMC Infect Dis</source><year>2022</year><month>10</month><day>31</day><volume>22</volume><issue>1</issue><fpage>813</fpage><pub-id pub-id-type="doi">10.1186/s12879-022-07800-w</pub-id><pub-id 
pub-id-type="medline">36316634</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chu</surname><given-names>VT</given-names></name><name name-style="western"><surname>Schwartz</surname><given-names>NG</given-names></name><name name-style="western"><surname>Donnelly</surname><given-names>MAP</given-names></name><name name-style="western"><surname>Chuey</surname><given-names>MR</given-names></name><name name-style="western"><surname>Soto</surname><given-names>R</given-names></name><name name-style="western"><surname>Yousaf</surname><given-names>AR</given-names></name><etal/></person-group><article-title>Comparison of home antigen testing with RT-PCR and viral culture during the course of SARS-CoV-2 infection</article-title><source>JAMA Intern Med</source><year>2022</year><month>07</month><day>1</day><volume>182</volume><issue>7</issue><fpage>701</fpage><lpage>709</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2022.1827</pub-id><pub-id pub-id-type="medline">35486394</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chin</surname><given-names>ET</given-names></name><name name-style="western"><surname>Huynh</surname><given-names>BQ</given-names></name><name name-style="western"><surname>Chapman</surname><given-names>LAC</given-names></name><name name-style="western"><surname>Murrill</surname><given-names>M</given-names></name><name name-style="western"><surname>Basu</surname><given-names>S</given-names></name><name name-style="western"><surname>Lo</surname><given-names>NC</given-names></name></person-group><article-title>Frequency of routine testing for COVID-19 in high-risk healthcare environments to reduce outbreaks</article-title><source>medRxiv. 
Preprint posted online on September 9, 2020</source><pub-id pub-id-type="doi">10.1101/2020.04.30.20087015</pub-id><pub-id pub-id-type="medline">32511523</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rickman</surname><given-names>HM</given-names></name><name name-style="western"><surname>Rampling</surname><given-names>T</given-names></name><name name-style="western"><surname>Shaw</surname><given-names>K</given-names></name><name name-style="western"><surname>Martinez-Garcia</surname><given-names>G</given-names></name><name name-style="western"><surname>Hail</surname><given-names>L</given-names></name><name name-style="western"><surname>Coen</surname><given-names>P</given-names></name><etal/></person-group><article-title>Nosocomial transmission of coronavirus disease 2019: a retrospective study of 66 hospital-acquired cases in a London teaching hospital</article-title><source>Clin Infect Dis</source><year>2021</year><month>02</month><day>16</day><volume>72</volume><issue>4</issue><fpage>690</fpage><lpage>693</lpage><pub-id pub-id-type="doi">10.1093/cid/ciaa816</pub-id><pub-id pub-id-type="medline">32562422</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klann</surname><given-names>JG</given-names></name><name name-style="western"><surname>Strasser</surname><given-names>ZH</given-names></name><name name-style="western"><surname>Hutch</surname><given-names>MR</given-names></name><name name-style="western"><surname>Kennedy</surname><given-names>CJ</given-names></name><name name-style="western"><surname>Marwaha</surname><given-names>JS</given-names></name><name name-style="western"><surname>Morris</surname><given-names>M</given-names></name><etal/></person-group><article-title>Distinguishing admissions specifically 
for COVID-19 from incidental SARS-CoV-2 admissions: national retrospective electronic health record study</article-title><source>J Med Internet Res</source><year>2022</year><month>05</month><day>18</day><volume>24</volume><issue>5</issue><fpage>e37931</fpage><pub-id pub-id-type="doi">10.2196/37931</pub-id><pub-id pub-id-type="medline">35476727</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hurst</surname><given-names>JH</given-names></name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names></name><name name-style="western"><surname>Maxson</surname><given-names>PJ</given-names></name><name name-style="western"><surname>Permar</surname><given-names>SR</given-names></name><name name-style="western"><surname>Boulware</surname><given-names>LE</given-names></name><name name-style="western"><surname>Goldstein</surname><given-names>BA</given-names></name></person-group><article-title>Development of an electronic health records datamart to support clinical and population health research</article-title><source>J Clin Transl Sci</source><year>2020</year><month>06</month><day>23</day><volume>5</volume><issue>1</issue><fpage>e13</fpage><pub-id pub-id-type="doi">10.1017/cts.2020.499</pub-id><pub-id pub-id-type="medline">33948239</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Uther</surname><given-names>W</given-names></name><name name-style="western"><surname>Mladeni&#x0107;</surname><given-names>D</given-names></name><name name-style="western"><surname>Ciaramita</surname><given-names>M</given-names></name><name name-style="western"><surname>Berendt</surname><given-names>B</given-names></name><name name-style="western"><surname>Ko&#x0142;cz</surname><given-names>A</given-names></name><name 
name-style="western"><surname>Grobelnik</surname><given-names>M</given-names></name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Sammut</surname><given-names>C</given-names></name><name name-style="western"><surname>Webb</surname><given-names>GI</given-names></name></person-group><article-title>TF&#x2013;IDF</article-title><source>Encyclopedia of Machine Learning</source><year>2011</year><publisher-loc>Boston, MA</publisher-loc><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-0-387-30164-8</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>NLTK</collab></person-group><article-title>Natural language toolkit</article-title><access-date>2022-10-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nltk.org/">www.nltk.org/</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tibshirani</surname><given-names>R</given-names></name></person-group><article-title>Regression shrinkage and selection via the lasso</article-title><source>J R Stat Soc Series B Stat Methodol</source><year>1996</year><month>01</month><volume>58</volume><issue>1</issue><fpage>267</fpage><lpage>288</lpage><pub-id pub-id-type="doi">10.1111/j.2517-6161.1996.tb02080.x</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names></name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><month>10</month><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref 
id="ref20"><label>20</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>R Foundation for Statistical Computing</collab></person-group><article-title>The R project for statistical computing</article-title><year>2021</year><access-date>2023-07-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://www.R-project.org/">www.R-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Python Software Foundation</collab></person-group><article-title>Python release Python 3.9.1</article-title><access-date>2023-01-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.python.org/downloads/release/python-391/">www.python.org/downloads/release/python-391/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>GitLab</collab></person-group><article-title>Using NLP to identify computable phenotype of patients hospitalized because of COVID-19</article-title><access-date>2023-07-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gitlab.com/changalice980/using-nlp-to-identify-computable-phenotype-of-patients-hospitalized-because-of-covid-19">gitlab.com/changalice980/using-nlp-to-identify-computable-phenotype-of-patients-hospitalized-because-of-covid-19</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>U.S. 
Department of Health and Human Services</collab></person-group><article-title>COVID-19 guidance for hospital reporting and FAQs for hospitals, hospital laboratory, and acute care facility data reporting</article-title><access-date>2023-07-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hhs.gov/sites/default/files/covid-19-faqs-hospitals-hospital-laboratory-acute-care-facility-data-reporting.pdf">www.hhs.gov/sites/default/files/covid-19-faqs-hospitals-hospital-laboratory-acute-care-facility-data-reporting.pdf</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Satterfield</surname><given-names>BA</given-names></name><name name-style="western"><surname>Dikilitas</surname><given-names>O</given-names></name><name name-style="western"><surname>Kullo</surname><given-names>IJ</given-names></name></person-group><article-title>Leveraging the electronic health record to address the COVID-19 pandemic</article-title><source>Mayo Clin Proc</source><year>2021</year><month>06</month><volume>96</volume><issue>6</issue><fpage>1592</fpage><lpage>1608</lpage><pub-id pub-id-type="doi">10.1016/j.mayocp.2021.04.008</pub-id><pub-id pub-id-type="medline">34088418</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lynch</surname><given-names>KE</given-names></name><name name-style="western"><surname>Viernes</surname><given-names>B</given-names></name><name name-style="western"><surname>Gatsby</surname><given-names>E</given-names></name><name name-style="western"><surname>DuVall</surname><given-names>SL</given-names></name><name name-style="western"><surname>Jones</surname><given-names>BE</given-names></name><name 
name-style="western"><surname>Box</surname><given-names>TL</given-names></name><etal/></person-group><article-title>Positive predictive value of COVID-19 ICD-10 diagnosis codes across calendar time and clinical setting</article-title><source>Clin Epidemiol</source><year>2021</year><month>10</month><day>27</day><volume>13</volume><fpage>1011</fpage><lpage>1018</lpage><pub-id pub-id-type="doi">10.2147/CLEP.S335621</pub-id><pub-id pub-id-type="medline">34737645</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>C</given-names></name><name name-style="western"><surname>Zhang</surname><given-names>HG</given-names></name><name name-style="western"><surname>L&#x2019;Yi</surname><given-names>S</given-names></name><name name-style="western"><surname>Weber</surname><given-names>G</given-names></name><name name-style="western"><surname>Avillach</surname><given-names>P</given-names></name><name name-style="western"><surname>Tan</surname><given-names>BWQ</given-names></name><etal/></person-group><article-title>Changes in laboratory value improvement and mortality rates over the course of the pandemic: an international retrospective cohort study of hospitalised patients infected with SARS-CoV-2</article-title><source>BMJ Open</source><year>2022</year><month>06</month><day>23</day><volume>12</volume><issue>6</issue><fpage>e057725</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2021-057725</pub-id><pub-id pub-id-type="medline">35738646</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplemental materials regarding variable descriptions, top data elements, and the association between vaccine status and outcome metrics.</p><media xlink:href="medinform_v11i1e46267_app1.docx" xlink:title="DOCX File, 244 
KB"/></supplementary-material></app-group></back></article>