<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i2e25457</article-id>
      <article-id pub-id-type="pmid">33449908</article-id>
      <article-id pub-id-type="doi">10.2196/25457</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Classification of the Disposition of Patients Hospitalized with COVID-19: Reading Discharge Summaries Using Natural Language Processing</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pradhan</surname>
            <given-names>Meeta</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Park</surname>
            <given-names>Hyungjun</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Fernandes</surname>
            <given-names>Marta</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Neurology</institution>
            <institution>Massachusetts General Hospital</institution>
            <addr-line>50 Staniford St</addr-line>
            <addr-line>Boston, MA, 02114</addr-line>
            <country>United States</country>
            <phone>1 6508621154</phone>
            <email>mbentofernandes@mgh.harvard.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7203-2832</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Sun</surname>
            <given-names>Haoqi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5041-8312</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jain</surname>
            <given-names>Aayushee</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5018-3234</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Alabsi</surname>
            <given-names>Haitham S</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6354-4679</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Brenner</surname>
            <given-names>Laura N</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6752-0471</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Ye</surname>
            <given-names>Elissa</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4851-6543</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Ge</surname>
            <given-names>Wendong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1557-5336</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Collens</surname>
            <given-names>Sarah I</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7010-7266</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Leone</surname>
            <given-names>Michael J</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0218-8612</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Das</surname>
            <given-names>Sudeshna</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9486-6811</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Robbins</surname>
            <given-names>Gregory K</given-names>
          </name>
          <degrees>MD, MPH</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7545-5817</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Mukerji</surname>
            <given-names>Shibani S</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5677-6954</ext-link>
        </contrib>
        <contrib id="contrib13" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Westover</surname>
            <given-names>M Brandon</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4803-312X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Neurology</institution>
        <institution>Massachusetts General Hospital</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Clinical Data Animation Center</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Division of Pulmonary and Critical Care Medicine</institution>
        <institution>Massachusetts General Hospital</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Division of General Internal Medicine</institution>
        <institution>Massachusetts General Hospital</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Division of Infectious Diseases</institution>
        <institution>Massachusetts General Hospital</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>McCance Center for Brain Health</institution>
        <institution>Massachusetts General Hospital</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Marta Fernandes <email>mbentofernandes@mgh.harvard.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>2</issue>
      <elocation-id>e25457</elocation-id>
      <history>
        <date date-type="received">
          <day>2</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>19</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>9</day>
          <month>12</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>12</day>
          <month>12</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Marta Fernandes, Haoqi Sun, Aayushee Jain, Haitham S Alabsi, Laura N Brenner, Elissa Ye, Wendong Ge, Sarah I Collens, Michael J Leone, Sudeshna Das, Gregory K Robbins, Shibani S Mukerji, M Brandon Westover. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 10.02.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/2/e25457" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Medical notes are a rich source of patient data; however, the nature of unstructured text has largely precluded the use of these data for large retrospective analyses. Transforming clinical text into structured data can enable large-scale research studies with electronic health records (EHR) data. Natural language processing (NLP) can be used for text information retrieval, reducing the need for labor-intensive chart review. Here we present an application of NLP to large-scale analysis of medical records at 2 large hospitals for patients hospitalized with COVID-19.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Our study goal was to develop an NLP pipeline to classify the discharge disposition (home, inpatient rehabilitation, skilled nursing inpatient facility [SNIF], and death) of patients hospitalized with COVID-19 based on hospital discharge summary notes.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Text mining and feature engineering were applied to unstructured text from hospital discharge summaries. The study included patients with COVID-19 discharged from 2 hospitals in the Boston, Massachusetts area (Massachusetts General Hospital and Brigham and Women’s Hospital) between March 10, 2020, and June 30, 2020. The data were divided into a training set (70%) and hold-out test set (30%). Discharge summaries were represented as bags-of-words consisting of single words (unigrams), bigrams, and trigrams. The number of features was reduced during training by excluding n-grams that occurred in fewer than 10% of discharge summaries, and further reduced using least absolute shrinkage and selection operator (LASSO) regularization while training a multiclass logistic regression model. Model performance was evaluated using the hold-out test set.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The study cohort included 1737 adult patients (median age 61 [SD 18] years; 55% men; 45% White and 16% Black; 14% nonsurvivors and 61% discharged home). The model selected 179 from a vocabulary of 1056 engineered features, consisting of combinations of unigrams, bigrams, and trigrams. The top features contributing most to the classification by the model (for each outcome) were the following: “appointments specialty,” “home health,” and “home care” (home); “intubate” and “ARDS” (inpatient rehabilitation); “service” (SNIF); “brief assessment” and “covid” (death). The model achieved a micro-average area under the receiver operating characteristic curve value of 0.98 (95% CI 0.97-0.98) and average precision of 0.81 (95% CI 0.75-0.84) in the testing set for prediction of discharge disposition.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>A supervised learning–based NLP approach is able to classify the discharge disposition of patients hospitalized with COVID-19. This approach has the potential to accelerate and increase the scale of research on patients’ discharge disposition that is possible with EHR data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ICU</kwd>
        <kwd>coronavirus</kwd>
        <kwd>electronic health record</kwd>
        <kwd>unstructured text</kwd>
        <kwd>natural language processing</kwd>
        <kwd>BoW</kwd>
        <kwd>LASSO</kwd>
        <kwd>feature selection</kwd>
        <kwd>machine learning</kwd>
        <kwd>intensive care unit</kwd>
        <kwd>COVID-19</kwd>
        <kwd>EHR</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The COVID-19 pandemic continues to present challenges for health care systems around the world [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref8">8</xref>], with over 32.7 million COVID-19 cases confirmed and 991,000 deaths worldwide as of September 27, 2020 [<xref ref-type="bibr" rid="ref6">6</xref>]. The SARS-CoV-2 virus first appeared in Wuhan, China, in December 2019. The first case in the United States was confirmed January 20, 2020 [<xref ref-type="bibr" rid="ref9">9</xref>], followed by rapid spread [<xref ref-type="bibr" rid="ref2">2</xref>]. By the end of April, Massachusetts became the third hardest hit state, trailing New York and New Jersey [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>To prepare for a possible second wave in Massachusetts, we set out to conduct a large-scale study of factors associated with outcomes in hospitalized patients at 2 large academic Boston hospitals. This effort required the significant task of reviewing medical records for over 1000 patients. For structured parts of the electronic health record (EHR), automated data extraction is straightforward. However, some essential information is exclusively or most reliably available only in semistructured or unstructured narrative medical notes, including patient-reported symptoms, examination findings, or social habits. Thus, developing automated approaches to EHR information extraction wherever possible is critical for more complete patient phenotyping.</p>
      <p>Natural language processing (NLP) deals with automated analysis of unstructured text data. Recent advances in NLP machine learning have empowered computers to do several tasks such as machine translation, speech recognition, speech synthesis, semantic understanding, and text summarization [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. NLP has the advantage of being much faster than human chart review of medical records [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      <p>Here we present an automated approach, using NLP, to extract a specific outcome from hospital discharge summaries: discharge destination or “disposition” (ie, anticipated location or status following discharge). Dispositions of interest included home, inpatient rehabilitation center, skilled nursing inpatient facility (SNIF), and death. Discharge disposition of patients with COVID-19 from health care facilities is important due to the high risk of transmission of the disease within nursing homes and hospitals when patients are discharged to locations other than home, and also because it represents an important measure closely related to functional outcome and level of disability following hospitalization, as well as overall costs of care. Furthermore, this information has the potential to aid health care facilities in resource planning to better prepare for the incoming flow of patients. Although our model is tailored for discharge disposition, the approach we developed is generalizable to other outcomes available in discharge summaries.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Overview</title>
        <p>Data were extracted from the hospital electronic medical record under a research protocol approved for a waiver of informed consent by the Partners Healthcare Institutional Review Board. Clinical data were retrospectively analyzed for all adult patients who tested positive for SARS-CoV-2 infection between March 10 and June 30, 2020. A total of 1737 patients admitted to 2 major Boston hospitals, 1232 from Massachusetts General Hospital (MGH) and 505 from Brigham and Women’s Hospital (BWH), were included. Only patients with a physician discharge summary and available known ground-truth discharge disposition were included.</p>
      </sec>
      <sec>
        <title>Data Collection and Processing</title>
        <sec>
          <title>Overview</title>
          <p>Data consisted of discharge summaries, which are unstructured free-text notes written by physicians, and a ground-truth record of discharge disposition, used to assess the accuracy of the NLP results. The methodology for note preprocessing is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. The upper part of the figure provides an overview of the text extraction for each field on the list of extraction fields depicted in <xref ref-type="table" rid="table1">Table 1</xref>. The lower part of the figure shows the methodology steps where the text extracted from all the fields is processed for modeling. The data were randomly stratified into train and test sets for modeling, which we address in the Model Development section.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Methodology steps for discharge summary notes preprocessing and modeling. The list of extraction fields is depicted in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
            </caption>
            <graphic xlink:href="medinform_v9i2e25457_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Information captured from discharge summaries, grouped in fields, and respective search tokens used in the regular expression.</p>
            </caption>
            <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
              <thead>
                <tr valign="top">
                  <td>Field</td>
                  <td>Search token</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Discharge disposition</td>
                  <td>“discharge,” “discharged,” “dispo,” “skilled nursing,” “snf”</td>
                </tr>
                <tr valign="top">
                  <td>Diagnosis</td>
                  <td>“diagnosis,” “diagnoses,” “problem,” “reason for admission,” “chief complaint”</td>
                </tr>
                <tr valign="top">
                  <td>Surgeries</td>
                  <td>“surgeries this admission”</td>
                </tr>
                <tr valign="top">
                  <td>Treatments</td>
                  <td>“treatments”</td>
                </tr>
                <tr valign="top">
                  <td>Tests</td>
                  <td>“tests”</td>
                </tr>
                <tr valign="top">
                  <td>Allergies</td>
                  <td>“allergies,” “allergic”</td>
                </tr>
                <tr valign="top">
                  <td>Diet</td>
                  <td>“diet,” “nutrition”</td>
                </tr>
                <tr valign="top">
                  <td>Medical history</td>
                  <td>“history”</td>
                </tr>
                <tr valign="top">
                  <td>Hospital course</td>
                  <td>“hospital course”</td>
                </tr>
                <tr valign="top">
                  <td>Laboratory results</td>
                  <td>“labs”</td>
                </tr>
                <tr valign="top">
                  <td>Activity</td>
                  <td>“activity,” “activities”</td>
                </tr>
                <tr valign="top">
                  <td>Physical exam</td>
                  <td>“discharge exam,” “physical exam”</td>
                </tr>
                <tr valign="top">
                  <td>Physical therapy</td>
                  <td>“physical therapy”</td>
                </tr>
                <tr valign="top">
                  <td>Occupational therapy</td>
                  <td>“occupational therapy”</td>
                </tr>
                <tr valign="top">
                  <td>Discharge instructions</td>
                  <td>“instructions”</td>
                </tr>
                <tr valign="top">
                  <td>Follow-up care</td>
                  <td>“follow up”</td>
                </tr>
                <tr valign="top">
                  <td>Discharge plan</td>
                  <td>“discharge plan”</td>
                </tr>
                <tr valign="top">
                  <td>Additional orders</td>
                  <td>“additional orders”</td>
                </tr>
                <tr valign="top">
                  <td>Code status</td>
                  <td>“code status”</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Document Preprocessing</title>
          <p>Admission, discharge, and birth dates were removed from the discharge summaries, as well as punctuation, special characters, blank spaces, and numerical digits. Notes were then subjected to lowercasing, tokenization, and correction using lemmatization, a procedure for obtaining the root form of the word, using vocabulary (dictionary importance of words) and morphological (word structure and grammar relations) analysis. <italic>WordNetLemmatizer</italic> from NLTK library in Python (Version 3.7; Python Software Foundation) was used with a part-of-speech (POS) tag specified as a verb. Patients’ names, addresses, health care facilities, and hospital unit names were removed, as well as single letters. Abbreviation expansion and spelling corrections were performed for a small list of frequently used clinical words (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). A list of commonly used and less informative stopwords was also removed from the notes (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        </sec>
        <sec>
          <title>Processing of Specific Discharge Summary Fields</title>
          <p>Discharge summaries at MGH and BWH are semistructured, with a series of named fields containing specific types of mostly free-text information (<xref ref-type="table" rid="table1">Table 1</xref>). We present an example of discharge summary notes with protected health information removed (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Text fields were identified based on information extracted from the notes using regular expressions with search tokens (<xref ref-type="table" rid="table1">Table 1</xref>). The function “str.extractall” from Python was used to extract a length of 200 letters of text onwards from all instances where the search token appeared.</p>
          <p>Some notes contained a “discharge disposition” field used to list the discharge disposition. We deleted this field to avoid an overly “easy” solution, because this field is not universally available, and because we wished to assess how well the approach is able to perform when structured data is unavailable. In a field where more than one extraction was performed (ie, with more than one search token), the corresponding results were joined, and duplicated words were removed. To illustrate with an example, for the “Diet” field, using the regular expressions with search tokens “diet” and “nutrition,” 200 letters were captured for each search token, for a total of 400 letters. Since there might be repeated information in the discharge summary regarding diet and nutrition recommendations, duplicated words were removed from the captured text. Where no data was captured with the search tokens, an indication of missingness was set with the name of the field and the suffix “_missing.”</p>
          <p>The texts extracted from all fields (depicted in <xref ref-type="table" rid="table1">Table 1</xref>) were joined to create a reduced version of the discharge summary, which was then subjected to tokenization, lemmatization, and abbreviation expansion, as described in the Document Preprocessing subsection. The vocabulary used for modeling was created based on these reduced versions of the discharge summaries contained in the training set. Documents were represented as a binary bag-of-words (BoW; ie, an ordered series of binary vectors indicating whether a given n-gram [word or sequence of 2 or 3 words] is present in the document, disregarding grammar and word order). The function <italic>CountVectorizer</italic> was used with its default parameters from Python, except for the n-gram range, which was set as unigrams (1 word), bigrams (2 consecutive words), and trigrams (3 consecutive words). As a first step to reduce dimensionality, only features present in at least 10% of the reduced version of the discharge summary notes were considered. Multiclass logistic regression with the least absolute shrinkage and selection operator (LASSO) [<xref ref-type="bibr" rid="ref17">17</xref>] was used to further sparsify the model.</p>
        </sec>
      </sec>
      <sec>
        <title>Outcome Measure</title>
        <p>The multiclass outcome measure was discharge disposition, composed of the classes: home, inpatient rehabilitation, SNIF, and death. “Home” included “home or self-care,” “home-health care services,” and patients who “left against medical advice.” SNIF included “Skilled Nursing Facility” and “Custodial Care Facility.”</p>
      </sec>
      <sec>
        <title>Model Development</title>
        <p>The training algorithm used the one-vs-rest scheme for multiclass classification, where a binary problem was fitted for each class and the class weight was balanced. Logistic regression [<xref ref-type="bibr" rid="ref18">18</xref>] with LASSO regularization was used as the classification model. The model estimator <inline-graphic xlink:href="medinform_v9i2e25457_fig4.png" xlink:type="simple" mimetype="image"/> is depicted in equation 1 and the LASSO regularization objective can be written as in equation 2. <inline-graphic xlink:href="medinform_v9i2e25457_fig5.png" xlink:type="simple" mimetype="image"/> corresponds to the design input matrix and <inline-graphic xlink:href="medinform_v9i2e25457_fig6.png" xlink:type="simple" mimetype="image"/> corresponds to the vector of observations, where <italic>n</italic> is the number of observations, in this case the number of discharge summaries or number of patients, and <italic>p</italic> the number of features in <inline-graphic xlink:href="medinform_v9i2e25457_fig7.png" xlink:type="simple" mimetype="image"/>. The vector of regression coefficients is given by β; <inline-graphic xlink:href="medinform_v9i2e25457_fig8.png" xlink:type="simple" mimetype="image"/> corresponds to the L1 norm of this coefficients vector, and λ is the regularization parameter that controls the amount of shrinkage. The regularization adds a penalty on the weights to prevent overfitting [<xref ref-type="bibr" rid="ref19">19</xref>]. The inverse of the regularization strength <italic>C</italic> was varied for the values {0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5}.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i2e25457_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Stratified random sampling was used to split the data set into a training set (70%) and a hold-out test set (30%). A randomized search was used for hyperparameter tuning during training with 100 iterations of 5-fold cross-validation. The solver was set to “liblinear” and the “warm start” hyperparameter was varied between true/false, where “true” corresponded to reusing the solution of the previous call to fit as initialization, and “false” corresponded to erasing the previous solution.</p>
      </sec>
      <sec>
        <title>Performance Measures</title>
        <p>The <italic>R</italic><sup>2</sup> coefficient of determination score was used in cross-validation scoring to select the best model configuration in the training data. The one standard error rule was used to select the regularization parameter. The simplest model, whose <italic>R</italic><sup>2</sup> mean score fell within 1 standard deviation of the maximum <italic>R</italic><sup>2</sup>, was selected.</p>
        <p>To measure model performance on test data, the area under the receiver operating characteristic curve (AUROC) was calculated. The ROC curve is a function of recall (sensitivity) versus the false positive rate (FPR; ie, 1–specificity; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The pair (Recall<sub>k</sub>, FPR<sub>k</sub>) is called an operating point for this curve, where k is a threshold that is varied to generate the ROC curve. The equations for these metrics are presented in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>The area under the precision-recall curve (AUPRC), which is an important measure in the presence of class imbalance, was also calculated. The pair (Recall<sub>k</sub>, Precision<sub>k</sub>) is referred to as an operating point for this curve. Average precision (AP; Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) summarizes this plot as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight.</p>
        <p>The F<sub>1</sub>-score (Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) was also assessed as another performance metric commonly reported for data sets with imbalanced numbers across classes [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>In total, 100 iterations of bootstrap random sampling with replacement were performed to calculate 95% CIs for performance metrics.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Summary of Patient Population</title>
        <p>From 1917 patients’ medical records, 1752 had a physician discharge summary and a discharge disposition within the categories of home, inpatient rehabilitation, SNIF, and death. Only adults (aged ≥18 years) were included in the analysis, leaving a study cohort of 1737 patients. The cohort was split into train and test sets using stratified random sampling according to outcome. Age in the train and test sets was balanced, with a median of 62 and 60 years old, respectively (<xref ref-type="table" rid="table2">Table 2</xref>). The majority of patients were White (n=774; 44.6%) and Black or African American (n=285; 16.4%). Most were discharged home (n=1052; 60.6%). Among all patients with COVID-19 in this sample, there were 243 (14.0%) nonsurvivors.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Baseline characteristics of the study patient population stratified by train and test sets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="490"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Characteristic</td>
                <td>Train set (n=1215)</td>
                <td>Test set (n=522)</td>
                <td>Total (N=1737)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">Age (years), median (SD)</td>
                <td>62.0 (18.2)</td>
                <td>60.0 (18.2)</td>
                <td>61.0 (18.2)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Gender, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Female</td>
                <td>545 (44.9)</td>
                <td>244 (46.7)</td>
                <td>789 (45.4)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Male</td>
                <td>670 (55.1)</td>
                <td>278 (53.3)</td>
                <td>948 (54.6)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Race, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td/>
                <td>White</td>
                <td>533 (43.9)</td>
                <td>241 (46.2)</td>
                <td>774 (44.6)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Hispanic or Latino</td>
                <td>52 (4.2)</td>
                <td>19 (3.6)</td>
                <td>71 (4.1)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Black or African American</td>
                <td>204 (16.8)</td>
                <td>81 (15.5)</td>
                <td>285 (16.4)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Asian</td>
                <td>46 (3.8)</td>
                <td>21 (4.0)</td>
                <td>67 (3.9)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>American Indian or Alaska Native</td>
                <td>31 (2.5)</td>
                <td>13 (2.5)</td>
                <td>44 (2.5)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Native Hawaiian or other Pacific Islander</td>
                <td>2 (0.2)</td>
                <td>1 (0.2)</td>
                <td>3 (0.2)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Unknown<sup>a</sup></td>
                <td>347 (28.6)</td>
                <td>146 (28.0)</td>
                <td>493 (28.3)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Institution, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Massachusetts General Hospital</td>
                <td>881 (72.5)</td>
                <td>351 (67.2)</td>
                <td>1232 (70.9)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Brigham and Women’s Hospital</td>
                <td>334 (27.5)</td>
                <td>171 (32.8)</td>
                <td>505 (29.1)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Discharge disposition, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Home</td>
                <td>736 (60.6)</td>
                <td>316 (60.5)</td>
                <td>1052 (60.6)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Inpatient rehabilitation</td>
                <td>102 (8.4)</td>
                <td>44 (8.4)</td>
                <td>146 (8.4)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Skilled nursing inpatient facility</td>
                <td>207 (17.0)</td>
                <td>89 (17.1)</td>
                <td>296 (17.0)</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Death</td>
                <td>170 (14.0)</td>
                <td>73 (14.0)</td>
                <td>243 (14.0)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Unknown includes “other,” “declined,” or “unavailable.”</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The preprocessed data set for modeling was created based on the notes extracted in all fields except the “discharge disposition” and “code status” fields, as described in the Processing of Specific Discharge Summary Fields subsection. Before dimensionality reduction, where features present in at least 10% of the reduced version of the discharge summary notes were considered, there were a total of 15,182 tokens (unigrams). After applying this dimensionality reduction step, we were left with 477 tokens. With this set of tokens, 3497 combinations of n-grams were generated, leading to a total of 1056 features with duplicates removed. Thus, the total number of candidate features in the training vocabulary was 1056, including 460 unigrams, 329 bigrams, and 267 trigrams.</p>
      </sec>
      <sec>
        <title>Modeling Results</title>
        <p>The best model configuration parameters and performance results in the hold-out test set are presented in <xref ref-type="table" rid="table3">Table 3</xref> with 95% CIs. The corresponding confusion matrices normalized by precision and recall are presented in <xref rid="figure2" ref-type="fig">Figure 2</xref>. The performance discriminated by discharge outcome is presented in <xref ref-type="table" rid="table4">Table 4</xref>. Higher performance was obtained for the outcomes of home discharge and death compared to inpatient rehabilitation and SNIF discharge outcomes. The model presented higher recall (0.95) and precision (1.0) for the death outcome. Home disposition also presented high performance for these metrics. For this model, 2 deceased patients were classified as discharged home. In experiments, for models where we included the discharge disposition field, extracted from the discharge summary, all deceased patients were correctly classified. The inpatient rehabilitation outcome presented the lowest recall (0.61) and 12 patients with this outcome were incorrectly classified by the model as discharged to SNIF. The outcome of disposition to SNIF presented the lowest precision (0.68) overall and 20 patients discharged home were incorrectly predicted as discharged to SNIF. Compared to the initial set of features in the training vocabulary, the final model contained approximately 83% fewer features, with a total of 179 features. The relative importance of the top 30 model features is presented in <xref rid="figure3" ref-type="fig">Figure 3</xref>, where the importance for each feature consisted of the sum of the absolute coefficients’ values across the outcomes.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Confusion matrices for the best model evaluated in the hold-out test set normalized (A) by recall and (B) by precision. SNIF: skilled nursing inpatient facility.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e25457_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Model performance in the hold-out test set and configuration parameters.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="140"/>
            <col width="130"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Area under the receiver operating characteristic curve<sup>a</sup></td>
                <td>Accuracy<sup>a</sup></td>
                <td>Recall<sup>a</sup></td>
                <td>F<sub>1</sub> score<sup>a</sup></td>
                <td>Average precision<sup>a</sup></td>
                <td>Precision<sup>a</sup></td>
                <td>Parameters</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0.98 (0.97-0.98)</td>
                <td>0.88 (0.85-0.90)</td>
                <td>0.88 (0.85-0.90)</td>
                <td>0.88 (0.85-0.90)</td>
                <td>0.81 (0.75-0.84)</td>
                <td>0.88 (0.85-0.90)</td>
                <td>Number of features (unigrams, bigrams, trigrams): 179 (95, 52, 32); C=0.09; warm start: true</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>The 95% CIs of bootstrapping results are in parentheses.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Relative importance of top 30 features obtained with the model coefficients estimates for (A) the sum of the absolute coefficients values and (B) the coefficients values discriminated by outcome. Coef: coefficient.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e25457_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Model performance in the hold-out test set by discharge outcome.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Outcome</td>
                <td>Area under the receiver operating characteristic curve<sup>a</sup></td>
                <td>Accuracy<sup>a</sup></td>
                <td>Recall<sup>a</sup></td>
                <td>F<sub>1</sub> score<sup>a</sup></td>
                <td>Average precision<sup>a</sup></td>
                <td>Precision<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Home</td>
                <td>0.97 (0.95-0.98)</td>
                <td>0.92 (0.89-0.94)</td>
                <td>0.92 (0.89-0.95)</td>
                <td>0.93 (0.91-0.95)</td>
                <td>0.92 (0.88-0.94)</td>
                <td>0.94 (0.91-0.97)</td>
              </tr>
              <tr valign="top">
                <td>Inpatient rehabilitation</td>
                <td>0.95 (0.91-0.98)</td>
                <td>0.95 (0.93-0.97)</td>
                <td>0.61 (0.53-0.76)</td>
                <td>0.67 (0.53-0.78)</td>
                <td>0.48 (0.32-0.64)</td>
                <td>0.73 (0.58-0.86)</td>
              </tr>
              <tr valign="top">
                <td>Skilled nursing inpatient facility</td>
                <td>0.93 (0.88-0.96)</td>
                <td>0.90 (0.87-0.92)</td>
                <td>0.81 (0.72-0.88)</td>
                <td>0.74 (0.64-0.79)</td>
                <td>0.58 (0.46-0.66)</td>
                <td>0.68 (0.58-0.75)</td>
              </tr>
              <tr valign="top">
                <td>Death</td>
                <td>1.00 (1.00-1.00)</td>
                <td>0.99 (0.99-1.00)</td>
                <td>0.95 (0.90-0.98)</td>
                <td>0.97 (0.95-0.99)</td>
                <td>0.95 (0.91-0.98)</td>
                <td>1.00 (1.00-1.00)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>The 95% CIs of bootstrapping results are in parentheses.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>“Service” was the feature assigned the highest importance for classification of the discharge outcomes. For inpatient rehabilitation and SNIF dispositions, the coefficient values for this feature are positive, which indicates that this term will most likely appear in the preprocessed notes for both outcomes. “Home care,” “healthcare home,” and “home health” were assigned a positive coefficient value for home disposition. “Service healthcare home” was also assigned high importance for this outcome, suggesting that this feature is related to patients discharged home with home health care services provided. “Medicine” and “appointments specialty” were also important for this outcome. “Rehab” had positive coefficients for both inpatient rehabilitation and SNIF dispositions. “Intubate” and “ARDS” (acute respiratory distress syndrome) are important features for inpatient rehabilitation disposition. For death, “discharge” and “activity tolerate” presented negative coefficient values, indicating that these features are unlikely to appear in discharge summaries of deceased patients. “Brief assessment” and “brief” are assigned high coefficient values for this outcome. “Covid” was assigned a positive coefficient value for predicting death, while the term was given negative values for inpatient rehabilitation and SNIF.</p>
        <p>Training performance is depicted in Figure S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, with the curve corresponding to the <italic>R</italic><sup>2</sup> scores for the different values of the inverse of the regularization strength. The top 15 features and their relative importance obtained with LASSO regularization are presented for each outcome (Figure S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Blue bars correspond to features with positive coefficient values and red bars to features with negative coefficient values. The areas under the ROC and Precision-Recall curves for the best model are also presented (Figure S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). We also assessed how the model performance and the features selected as the most important in the train set varied with the dimension of the train set (Figure S4 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). The hold-out test set for model evaluation was fixed and the train set dimension was varied from 10% to 100% of the original train set, with 1215 patients. We observed that the best performance was achieved with a higher number of patients in the train set (ie, the original train set of 100%). However, with 50% versus 100% of the original train set, the model achieved good performance for 1018 (versus 1056) vocabulary features (AUROC 0.97 versus 0.98 and AP 0.79 versus 0.81, respectively). We assessed the common features between each train set and the original train set (Figure S5 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Among the top 30 features, there were 10 common features between the 50% and the original train sets. A higher number of common features was found for the train set with 90% of the original train set, with a total of 17 common features. Finally, we observed that more than half of the features in the top 30 from the original train set were selected as top 30 in at least two train sets (Figure S6 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, a machine learning–based NLP pipeline was developed to classify the discharge disposition of adult patients hospitalized with COVID-19. The model achieved near-perfect identification of patients with outcomes of home disposition or death. For the intermediate outcomes of inpatient rehabilitation or SNIF, performance was imperfect but also acceptable. Due to this classification task being relatively easy, more complex and time-consuming modeling approaches, such as recurrent neural networks or bidirectional encoder representations from transformers, were not considered. We acknowledge that for harder tasks, these approaches can improve performance. The final method is automated, thus enabling large-scale rapid processing of thousands of discharge summaries, a task that is infeasible when relying on manual chart review.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The present analysis was limited to a cohort of patients with COVID-19, who may have specific medical symptoms related to the disease. Therefore, as future work, it is proposed to extend the model to other cohorts. Further, although results spanned 2 hospitals, they are located in the same geographic region (Boston, Massachusetts). Thus, our cohort may not be representative of other US and non-US populations. Moreover, decision making for discharge disposition may vary for different hospitals, according to the number of SNIFs or rehabilitation centers in the geographic area, which may affect the generalizability of the model. The models were developed with textual information from discharge summaries, while the addition of other clinical features (eg, physical or occupational therapy reports, social work or case manager notes) was not considered, which is a limitation of the study and can be pursued in future work.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Extraction of information from clinical narratives is a growing application of NLP in health care. NLP has been used to extract information from hospital discharge notes about medical conditions such as postsurgical sepsis [<xref ref-type="bibr" rid="ref21">21</xref>], pneumonia [<xref ref-type="bibr" rid="ref22">22</xref>], or other potential medical problems [<xref ref-type="bibr" rid="ref23">23</xref>], as well as to identify critical illness [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], detect adverse events [<xref ref-type="bibr" rid="ref26">26</xref>], predict risk of rehospitalization [<xref ref-type="bibr" rid="ref27">27</xref>], extract medication information [<xref ref-type="bibr" rid="ref28">28</xref>], and risk stratify patients [<xref ref-type="bibr" rid="ref29">29</xref>]. To the best of our knowledge, ours is the first work on classifying hospital discharge disposition based on discharge summary notes using machine learning and NLP.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study shows that a supervised learning–based NLP approach can be used to accurately classify the discharge disposition of hospitalized patients with COVID-19 in an automated fashion. This model, and the NLP approach used to develop it, have the potential to accelerate and increase the scale of research that is possible with EHR data.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Methodology.</p>
        <media xlink:href="medinform_v9i2e25457_app1.doc" xlink:title="DOC File, 555 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Results.</p>
        <media xlink:href="medinform_v9i2e25457_app2.doc" xlink:title="DOC File, 399 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AP</term>
          <def>
            <p>average precision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ARDS</term>
          <def>
            <p>acute respiratory distress syndrome</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUPRC</term>
          <def>
            <p>area under the precision recall curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">BoW</term>
          <def>
            <p>bag-of-words</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">BWH</term>
          <def>
            <p>Brigham and Women’s Hospital</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FPR</term>
          <def>
            <p>false positive rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">LASSO</term>
          <def>
            <p>least absolute shrinkage and selection operator</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">MGH</term>
          <def>
            <p>Massachusetts General Hospital</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SNIF</term>
          <def>
            <p>skilled nursing inpatient facility</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>MF and HS were co-first authors of this paper, and GKR, SSM, and MBW were co-senior authors. MBW was supported by the Glenn Foundation for Medical Research and American Federation for Aging Research (Breakthroughs in Gerontology Grant); American Academy of Sleep Medicine (AASM Foundation Strategic Research Award); Football Players Health Study (FPHS) at Harvard University; Department of Defense through a subcontract from Moberg ICU Solutions Inc; and the National Institutes of Health (1R01NS102190, 1R01NS102574, 1R01NS107291, 1RF1AG064312). MBW is a cofounder of Beacon Biosignals.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Richardson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hirsch</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Crawford</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>McGinn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Davidson</surname>
              <given-names>KW</given-names>
            </name>
            <collab>the Northwell COVID-19 Research Consortium</collab>
            <name name-style="western">
              <surname>Barnaby</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Chelico</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Cookingham</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Coppa</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Diefenbach</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Dominello</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Duer-Hefele</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Falzon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gitlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hajizadeh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Harvin</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Hirschwerk</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kozel</surname>
              <given-names>ZM</given-names>
            </name>
            <name name-style="western">
              <surname>Marrast</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Mogavero</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Osorio</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zanos</surname>
              <given-names>TP</given-names>
            </name>
          </person-group>
          <article-title>Presenting Characteristics, Comorbidities, and Outcomes Among 5700 Patients Hospitalized With COVID-19 in the New York City Area</article-title>
          <source>JAMA</source>
          <year>2020</year>
          <month>05</month>
          <day>26</day>
          <volume>323</volume>
          <issue>20</issue>
          <fpage>2052</fpage>
          <lpage>2059</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32320003"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2020.6775</pub-id>
          <pub-id pub-id-type="medline">32320003</pub-id>
          <pub-id pub-id-type="pii">2765184</pub-id>
          <pub-id pub-id-type="pmcid">PMC7177629</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holshue</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>DeBolt</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lindquist</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lofy</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Wiesman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bruce</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Spitters</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ericson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wilkerson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tural</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gerber</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lindstrom</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pallansch</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Weldon</surname>
              <given-names>WC</given-names>
            </name>
            <name name-style="western">
              <surname>Biggs</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Uyeki</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Pillai</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>First Case of 2019 Novel Coronavirus in the United States</article-title>
          <source>N Engl J Med</source>
          <year>2020</year>
          <month>03</month>
          <day>05</day>
          <volume>382</volume>
          <issue>10</issue>
          <fpage>929</fpage>
          <lpage>936</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmoa2001191</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fauci</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Redfield</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Covid-19 — Navigating the Uncharted</article-title>
          <source>N Engl J Med</source>
          <year>2020</year>
          <month>03</month>
          <day>26</day>
          <volume>382</volume>
          <issue>13</issue>
          <fpage>1268</fpage>
          <lpage>1269</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32109011"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMe2002387</pub-id>
          <pub-id pub-id-type="medline">32109011</pub-id>
          <pub-id pub-id-type="pmcid">PMC7121221</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Clinical course and risk factors for mortality of adult inpatients with COVID-19 in Wuhan, China: a retrospective cohort study</article-title>
          <source>The Lancet</source>
          <year>2020</year>
          <month>03</month>
          <volume>395</volume>
          <issue>10229</issue>
          <fpage>1054</fpage>
          <lpage>1062</lpage>
          <pub-id pub-id-type="doi">10.1016/s0140-6736(20)30566-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>McGoogan</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Characteristics of and Important Lessons From the Coronavirus Disease 2019 (COVID-19) Outbreak in China: Summary of a Report of 72 314 Cases From the Chinese Center for Disease Control and Prevention</article-title>
          <source>JAMA</source>
          <year>2020</year>
          <month>04</month>
          <day>07</day>
          <volume>323</volume>
          <issue>13</issue>
          <fpage>1239</fpage>
          <lpage>1242</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2020.2648</pub-id>
          <pub-id pub-id-type="medline">32091533</pub-id>
          <pub-id pub-id-type="pii">2762130</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>CDC COVID-19 Response Team</collab>
          </person-group>
          <article-title>Severe Outcomes Among Patients with Coronavirus Disease 2019 (COVID-19) — United States, February 12–March 16, 2020</article-title>
          <source>MMWR Morb Mortal Wkly Rep</source>
          <year>2020</year>
          <month>03</month>
          <day>27</day>
          <volume>69</volume>
          <issue>12</issue>
          <fpage>343</fpage>
          <lpage>346</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.15585/mmwr.mm6912e2"/>
          </comment>
          <pub-id pub-id-type="doi">10.15585/mmwr.mm6912e2</pub-id>
          <pub-id pub-id-type="medline">32214079</pub-id>
          <pub-id pub-id-type="pmcid">PMC7725513</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>EHY</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>YC</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Mo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cowling</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Leung</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Temporal dynamics in viral shedding and transmissibility of COVID-19</article-title>
          <source>Nat Med</source>
          <year>2020</year>
          <month>05</month>
          <day>15</day>
          <volume>26</volume>
          <issue>5</issue>
          <fpage>672</fpage>
          <lpage>675</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-020-0869-5</pub-id>
          <pub-id pub-id-type="medline">32296168</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-020-0869-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lauer</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Grantz</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Meredith</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Azman</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Lessler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application</article-title>
          <source>Ann Intern Med</source>
          <year>2020</year>
          <month>05</month>
          <day>05</day>
          <volume>172</volume>
          <issue>9</issue>
          <fpage>577</fpage>
          <lpage>582</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acpjournals.org/doi/10.7326/M20-0504?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.7326/M20-0504</pub-id>
          <pub-id pub-id-type="medline">32150748</pub-id>
          <pub-id pub-id-type="pii">2762808</pub-id>
          <pub-id pub-id-type="pmcid">PMC7081172</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Clinical Characteristics of 138 Hospitalized Patients With 2019 Novel Coronavirus-Infected Pneumonia in Wuhan, China</article-title>
          <source>JAMA</source>
          <year>2020</year>
          <month>03</month>
          <day>17</day>
          <volume>323</volume>
          <issue>11</issue>
          <fpage>1061</fpage>
          <lpage>1069</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32031570"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2020.1585</pub-id>
          <pub-id pub-id-type="medline">32031570</pub-id>
          <pub-id pub-id-type="pii">2761044</pub-id>
          <pub-id pub-id-type="pmcid">PMC7042881</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <article-title>The Covid-19 Tracker</article-title>
          <source>STAT</source>
          <access-date>2021-01-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.statnews.com/feature/coronavirus/covid-19-tracker/">https://www.statnews.com/feature/coronavirus/covid-19-tracker/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nallapati</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Nogueira dos Santos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gulcehre</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Abstractive text summarization using sequence-to-sequence RNNs and beyond</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 19, 2016.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1602.06023"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/k16-1028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirschberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Advances in natural language processing</article-title>
          <source>Science</source>
          <year>2015</year>
          <month>07</month>
          <day>17</day>
          <volume>349</volume>
          <issue>6245</issue>
          <fpage>261</fpage>
          <lpage>266</lpage>
          <pub-id pub-id-type="doi">10.1126/science.aaa8685</pub-id>
          <pub-id pub-id-type="medline">26185244</pub-id>
          <pub-id pub-id-type="pii">349/6245/261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rzhetsky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shatkay</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>New directions in biomedical text annotation: definitions, guidelines and corpus construction</article-title>
          <source>BMC Bioinformatics</source>
          <year>2006</year>
          <month>07</month>
          <day>25</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>356</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-356"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-7-356</pub-id>
          <pub-id pub-id-type="medline">16867190</pub-id>
          <pub-id pub-id-type="pii">1471-2105-7-356</pub-id>
          <pub-id pub-id-type="pmcid">PMC1559725</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buchan</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpal</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Alatorre</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gudivada</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sanseau</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Koehler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The role of translational bioinformatics in drug discovery</article-title>
          <source>Drug Discov Today</source>
          <year>2011</year>
          <month>05</month>
          <volume>16</volume>
          <issue>9-10</issue>
          <fpage>426</fpage>
          <lpage>434</lpage>
          <pub-id pub-id-type="doi">10.1016/j.drudis.2011.03.002</pub-id>
          <pub-id pub-id-type="medline">21402166</pub-id>
          <pub-id pub-id-type="pii">S1359-6446(11)00074-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing: an introduction</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <month>09</month>
          <day>01</day>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>544</fpage>
          <lpage>551</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21846786"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000464</pub-id>
          <pub-id pub-id-type="medline">21846786</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000464</pub-id>
          <pub-id pub-id-type="pmcid">PMC3168328</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>Ö</given-names>
            </name>
            <name name-style="western">
              <surname>South</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>DuVall</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>2010 i2b2/VA challenge on concepts, assertions, and relations in clinical text</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>552</fpage>
          <lpage>556</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21685143"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000203</pub-id>
          <pub-id pub-id-type="medline">21685143</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000203</pub-id>
          <pub-id pub-id-type="pmcid">PMC3168320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Regression Shrinkage and Selection Via the Lasso</article-title>
          <source>Journal of the Royal Statistical Society: Series B (Methodological)</source>
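          <!-- NOTE(review): volume 58(1) and the DOI suffix (1996.tb02080.x) point to the 1996 issue; the previously recorded date, 2018-12-05, looks like an online re-posting date. Confirm. -->
          <year>1996</year>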
          <volume>58</volume>
          <issue>1</issue>
          <fpage>267</fpage>
          <lpage>288</lpage>
          <pub-id pub-id-type="doi">10.1111/j.2517-6161.1996.tb02080.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cramer</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>The Origins of Logistic Regression</article-title>
          <source>SSRN Journal</source>
          <year>2003</year>
          <month>01</month>
          <day>25</day>
          <pub-id pub-id-type="doi">10.2139/ssrn.360300</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bühlmann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>van de Geer</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Statistics for High-Dimensional Data: Methods, Theory and Applications</source>
          <year>2011</year>
          <publisher-loc>Heidelberg, Germany</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Janeja</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Imbalanced learning to predict long stay Emergency Department patients</article-title>
          <year>2015</year>
          <conf-name>2015 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <fpage>807</fpage>
          <lpage>814</lpage>
          <pub-id pub-id-type="doi">10.1109/BIBM.2015.7359790</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arvind</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ukogu</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing of electronic medical records can identify sepsis following orthopedic surgery</article-title>
          <source>The Spine Journal</source>
          <year>2018</year>
          <month>08</month>
          <conf-name>Proceedings of the 33rd Annual Meeting of the North American Spine Society</conf-name>
          <conf-date>September 26-29, 2018</conf-date>
          <conf-loc>Los Angeles, CA</conf-loc>
          <fpage>S29</fpage>
          <pub-id pub-id-type="doi">10.1016/j.spinee.2018.06.068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yetisgen-Yildiz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Glavan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderwende</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wurfel</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Identifying patients with pneumonia from free-text intensive care unit reports</article-title>
          <year>2011</year>
          <conf-name>ICML 2011 Workshop on Learning from Unstructured Clinical Text</conf-name>
          <conf-date>2011</conf-date>
          <conf-loc>Bellevue, WA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://faculty.washington.edu/melihay/publications/ICML_2011.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meystre</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haug</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Improving the sensitivity of the problem list in an intensive care unit by using natural language processing</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2006</year>
          <fpage>554</fpage>
          <lpage>558</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17238402"/>
          </comment>
          <pub-id pub-id-type="medline">17238402</pub-id>
          <pub-id pub-id-type="pii">85533</pub-id>
          <pub-id pub-id-type="pmcid">PMC1839473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weissman</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Harhay</surname>
              <given-names>MO</given-names>
            </name>
            <name name-style="western">
              <surname>Lugo</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Fuchs</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>Halpern</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Mikkelsen</surname>
              <given-names>ME</given-names>
            </name>
          </person-group>
          <article-title>Natural Language Processing to Assess Documentation of Features of Critical Illness in Discharge Documents of Acute Respiratory Distress Syndrome Survivors</article-title>
          <source>Annals ATS</source>
          <year>2016</year>
          <month>09</month>
          <volume>13</volume>
          <issue>9</issue>
          <fpage>1538</fpage>
          <lpage>1545</lpage>
          <pub-id pub-id-type="doi">10.1513/annalsats.201602-131oc</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marafino</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Thombley</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Luft</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Sing</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Kazi</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>DeJong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Boscardin</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Validation of Prediction Models for Critical Care Outcomes Using Natural Language Processing of Electronic Health Record Data</article-title>
          <source>JAMA Netw Open</source>
          <year>2018</year>
          <month>12</month>
          <day>07</day>
          <volume>1</volume>
          <issue>8</issue>
          <fpage>e185097</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2018.5097"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.5097</pub-id>
          <pub-id pub-id-type="medline">30646310</pub-id>
          <pub-id pub-id-type="pii">2719128</pub-id>
          <pub-id pub-id-type="pmcid">PMC6324323</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murff</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Forster</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Fiskio</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Heiman</surname>
              <given-names>HL</given-names>
            </name>
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DW</given-names>
            </name>
          </person-group>
          <article-title>Electronically screening discharge summaries for adverse medical events</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2003</year>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>339</fpage>
          <lpage>350</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/12668691"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M1201</pub-id>
          <pub-id pub-id-type="medline">12668691</pub-id>
          <pub-id pub-id-type="pii">M1201</pub-id>
          <pub-id pub-id-type="pmcid">PMC181984</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hurdle</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Predictive Model for Risk of 30-Day Rehospitalization Using a Natural Language Processing/Machine Learning Approach Among Medicare Patients with Heart Failure</article-title>
          <source>Journal of Cardiac Failure</source>
          <year>2020</year>
          <month>10</month>
          <volume>26</volume>
          <issue>10</issue>
          <fpage>S5</fpage>
          <pub-id pub-id-type="doi">10.1016/j.cardfail.2020.09.023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Automatic extraction of medication information from medical discharge summaries</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <month>09</month>
          <day>01</day>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>545</fpage>
          <lpage>548</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819861"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.003863</pub-id>
          <pub-id pub-id-type="medline">20819861</pub-id>
          <pub-id pub-id-type="pii">17/5/545</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995675</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Saeed</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Long</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Risk stratification of ICU patients using topic models inferred from unstructured progress notes</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2012</year>
          <volume>2012</volume>
          <fpage>505</fpage>
          <lpage>511</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23304322"/>
          </comment>
          <pub-id pub-id-type="medline">23304322</pub-id>
          <pub-id pub-id-type="pmcid">PMC3540429</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
