<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i2e21679</article-id>
      <article-id pub-id-type="pmid">33544689</article-id>
      <article-id pub-id-type="doi">10.2196/21679</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Lexicon Development for COVID-19-related Concepts Using Open-source Word Embedding Sources: An Intrinsic and Extrinsic Evaluation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Yijia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Qingyu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Parikh</surname>
            <given-names>Soham</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1621-7645</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Davoudi</surname>
            <given-names>Anahita</given-names>
          </name>
          <degrees>MS, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4345-3889</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Shun</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0721-5429</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Giraldo</surname>
            <given-names>Carolina</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8990-2136</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Schriver</surname>
            <given-names>Emily</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4522-1029</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Mowery</surname>
            <given-names>Danielle</given-names>
          </name>
          <degrees>MS, PhD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <address>
            <institution>Department of Biostatistics, Epidemiology, &#38; Informatics</institution>
            <institution>Institute for Biomedical Informatics</institution>
            <institution>University of Pennsylvania</institution>
            <addr-line>A206 Richards Hall,</addr-line>
            <addr-line>3700 Hamilton Walk</addr-line>
            <addr-line>Philadelphia, PA, 19104-6021</addr-line>
            <country>United States</country>
            <phone>1 215 746 6677</phone>
            <email>dlmowery@pennmedicine.upenn.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3802-4457</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Engineering and Applied Science</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Biostatistics, Epidemiology, &#38; Informatics</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Division of Hematology/Oncology</institution>
        <institution>Department of Medicine</institution>
        <institution>Hospital of the University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Philadelphia College of Osteopathic Medicine</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Data Analytics Center</institution>
        <institution>Penn Medicine</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Biostatistics, Epidemiology, &#38; Informatics</institution>
        <institution>Institute for Biomedical Informatics</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Danielle Mowery <email>dlmowery@pennmedicine.upenn.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>2</issue>
      <elocation-id>e21679</elocation-id>
      <history>
        <date date-type="received">
          <day>21</day>
          <month>6</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>15</day>
          <month>7</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>31</day>
          <month>1</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Soham Parikh, Anahita Davoudi, Shun Yu, Carolina Giraldo, Emily Schriver, Danielle Mowery. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 22.02.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/2/e21679" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Scientists are developing new computational methods and prediction models to better clinically understand COVID-19 prevalence, treatment efficacy, and patient outcomes. These efforts could be improved by leveraging documented COVID-19–related symptoms, findings, and disorders from clinical text sources in an electronic health record. Word embeddings can identify terms related to these clinical concepts from both the biomedical and nonbiomedical domains, and are being shared with the open-source community at large. However, it’s unclear how useful openly available word embeddings are for developing lexicons for COVID-19–related concepts.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Given an initial lexicon of COVID-19–related terms, this study aims to characterize the returned terms by similarity across various open-source word embeddings and determine common semantic and syntactic patterns between the COVID-19 queried terms and returned terms specific to the word embedding source.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We compared seven openly available word embedding sources. Using a series of COVID-19–related terms for associated symptoms, findings, and disorders, we conducted an interannotator agreement study to determine how accurately the most similar returned terms could be classified according to semantic types by three annotators. We conducted a qualitative study of COVID-19 queried terms and their returned terms to detect informative patterns for constructing lexicons. We demonstrated the utility of applying such learned synonyms to discharge summaries by reporting the proportion of patients identified by concept among three patient cohorts: pneumonia (n=6410), acute respiratory distress syndrome (n=8647), and COVID-19 (n=2397).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We observed high pairwise interannotator agreement (Cohen kappa) for symptoms (0.86-0.99), findings (0.93-0.99), and disorders (0.93-0.99). Word embedding sources generated based on characters tend to return more synonyms (mean count of 7.2 synonyms) compared to token-based embedding sources (mean counts range from 2.0 to 3.4). Word embedding sources queried using a qualifier term (eg, dry cough or muscle pain) more often returned qualifiers of a similar semantic type (eg, “dry” returns consistency qualifiers like “wet” and “runny”) compared to single-term queries (eg, cough or pain). A higher proportion of patients had documented fever (0.61-0.84), cough (0.41-0.55), shortness of breath (0.40-0.59), and hypoxia (0.51-0.56) retrieved than other clinical features. Terms for dry cough returned a higher proportion of patients with COVID-19 (0.07) than the pneumonia (0.05) and acute respiratory distress syndrome (0.03) populations.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Word embeddings are valuable technology for learning related terms, including synonyms. When leveraging openly available word embedding sources, choices made for the construction of the word embeddings can significantly influence the words learned.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>word embedding</kwd>
        <kwd>COVID-19</kwd>
        <kwd>intrinsic</kwd>
        <kwd>open-source</kwd>
        <kwd>computation</kwd>
        <kwd>model</kwd>
        <kwd>prediction</kwd>
        <kwd>semantic</kwd>
        <kwd>syntactic</kwd>
        <kwd>pattern</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>COVID-19 has become a pandemic that is felt throughout the world. Scientists are developing new methods for determining infection rates, disease burden, treatment efficacy, and patient outcomes [<xref ref-type="bibr" rid="ref1">1</xref>]. Our ability to detect and phenotype patients with COVID-19 and controls for clinical and translational studies requires clinical symptomatology, radiological imaging, laboratory tests, and associated disorders derived from electronic health record data [<xref ref-type="bibr" rid="ref2">2</xref>]. Much of this information can be locked within the electronic health record clinical notes [<xref ref-type="bibr" rid="ref3">3</xref>]. To accurately characterize each patient’s COVID-19 profile for study, we must develop natural language processing systems to reliably extract COVID-19–related information. One of the first steps to extracting this information is developing lexicons with adequate coverage for all synonyms describing each COVID-19 concept. In the clinical domain, lexicons have been developed using several techniques: standardized vocabularies [<xref ref-type="bibr" rid="ref4">4</xref>], lexico-syntactic patterns [<xref ref-type="bibr" rid="ref5">5</xref>], term expansion [<xref ref-type="bibr" rid="ref6">6</xref>], and distributional semantics [<xref ref-type="bibr" rid="ref7">7</xref>]. Moreover, word embedding technologies have become increasingly popular for identifying semantically and syntactically-related terms within vector spaces by assessing the <italic>distributional hypothesis</italic> that “words that share a common, relative vector space will often also share a common, semantic relatedness” [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      </sec>
      <sec>
        <title>Word Embeddings</title>
        <p>Word embeddings represent a word in a vector space while preserving its contextualized usage. Word embeddings have been leveraged to learn synonyms to develop lexicons [<xref ref-type="bibr" rid="ref8">8</xref>]. These vectors are commonly learned by training algorithms like Word2Vec [<xref ref-type="bibr" rid="ref9">9</xref>], FastText [<xref ref-type="bibr" rid="ref10">10</xref>], and global vectors for word representation (GloVe) [<xref ref-type="bibr" rid="ref11">11</xref>] on large corpora, including domain-independent texts (eg, internet web pages like Wikipedia and CommonCrawl, and social media like Twitter and Reddit) and domain-dependent texts (eg, clinical notes like the Medical Information Mart for Intensive Care III [MIMIC III] database notes [<xref ref-type="bibr" rid="ref12">12</xref>] and biomedical research articles like PubMed). These domain-dependent embeddings may capture richer biomedical information than domain-independent embeddings (eg, <italic>standard GloVe embeddings</italic>) and are becoming increasingly available to the community at large. For example, <italic>BioASQ</italic> released their embeddings trained using the Word2Vec algorithm on 11 million biomedical abstracts from PubMed [<xref ref-type="bibr" rid="ref13">13</xref>]. Pyysalo et al [<xref ref-type="bibr" rid="ref14">14</xref>] trained embeddings using Word2Vec on a combination of PubMed and PubMed Central articles along with Wikipedia to combine open domain and biomedical knowledge (<italic>biomedical natural language processing</italic> [<italic>BioNLP</italic>] corpus). 
Zhang et al [<xref ref-type="bibr" rid="ref15">15</xref>] (<italic>BioWordVec</italic> corpus) and Flamholz et al [<xref ref-type="bibr" rid="ref16">16</xref>] (<italic>ClinicalEmbeddings</italic> corpus) also leveraged PubMed and PubMed Central articles in addition to clinical notes from the MIMIC III to train embeddings using the FastText, GloVe, and Word2Vec algorithms [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      </sec>
      <sec>
        <title>Word Embedding Evaluations</title>
        <p>Systematic evaluations of word embeddings can be broadly classified into two categories, <italic>intrinsic</italic> and <italic>extrinsic</italic> evaluations. Intrinsic evaluations typically evaluate these word embeddings against human annotations by measuring the similarity or relationship between the queried and returned word pairs. Pakhomov et al [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>] and Pedersen et al [<xref ref-type="bibr" rid="ref19">19</xref>] have developed data sets containing pairs of biomedical terms along with their degree of relatedness as rated by human annotators. Furthermore, Pakhomov et al [<xref ref-type="bibr" rid="ref17">17</xref>] and Hliaoutakis [<xref ref-type="bibr" rid="ref20">20</xref>] have annotated pairs of medical terms for their semantic similarity. One intrinsic evaluation for validating these human annotations entails computing the Spearman coefficient between word pairs. Others have intrinsically evaluated word embeddings by clustering biomedical terms from the Unified Medical Language System and Ranker [<xref ref-type="bibr" rid="ref21">21</xref>], and assessing the cluster quality using metrics like the Davies-Bouldin index and the Dunn index. Word embeddings have advanced the state of the art for many intrinsic natural language processing subtasks (ie, reading comprehension [<xref ref-type="bibr" rid="ref22">22</xref>], natural language inference [<xref ref-type="bibr" rid="ref23">23</xref>], text summarization [<xref ref-type="bibr" rid="ref24">24</xref>], vocabulary development [<xref ref-type="bibr" rid="ref8">8</xref>], and document classification [<xref ref-type="bibr" rid="ref25">25</xref>]). 
An extrinsic or summative evaluation of clinical word embeddings can involve evaluating the performance of machine learning models by using word embeddings to complete a biomedical research task or clinical operation such as patient phenotyping [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], patient fall prediction [<xref ref-type="bibr" rid="ref25">25</xref>], and patient hospital readmission prediction [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
      </sec>
      <sec>
        <title>COVID-19 and Word Embeddings</title>
        <p>In recent years, there has been extensive work to leverage biomedical and clinical texts for developing clinical word embeddings to create concept lexicons [<xref ref-type="bibr" rid="ref29">29</xref>]. For example, clinical word embeddings have been trained to identify drugs [<xref ref-type="bibr" rid="ref30">30</xref>], substance abuse terms [<xref ref-type="bibr" rid="ref8">8</xref>], and anatomical locations [<xref ref-type="bibr" rid="ref16">16</xref>]. More recently, word embeddings have been used to understand the COVID-19 pandemic. For example, Schild et al [<xref ref-type="bibr" rid="ref31">31</xref>] trained word2vec models for learning terms related to “virus” (“corona,” “covid,” “wuflu,” “coronovirus,” “coronavirus”) for understanding the emergence of sinophobic behavior on web communities like Twitter and 4chan’s /pol/ facing COVID-19 outbreaks. Klein et al [<xref ref-type="bibr" rid="ref32">32</xref>] applied pretrained Bidirectional Encoder Representations from Transformers to identify Twitter users with probable or possible COVID-19 infection using their self-reported Twitter messages and temporal-spatial information. However, to our knowledge, there has been no intrinsic evaluation of openly available word embeddings to identify COVID-19 terms related to symptoms, findings, and disorder concepts for encoding clinical notes.</p>
        <p>Our long-term goal is to develop a COVID-19 information extraction system to support a variety of purposes, including clinical and translational research, observational studies, clinical trials, public health monitoring, and hospital capacity monitoring. Our short-term goal is to conduct an <italic>intrinsic evaluation</italic> to qualitatively analyze and compare various openly available word embedding sources by categorizing the most similar words returned for symptoms, findings, and disorders related to COVID-19, and to identify common patterns between returned terms and their associated COVID-19 query terms to better understand which of these word embedding sources and their configurations could support synonym discovery. An additional short-term goal is to conduct an <italic>extrinsic evaluation</italic> to apply these terms and their learned synonyms to the discharge summaries of patients with pneumonia, acute respiratory distress syndrome (ARDS), and COVID-19, and report the proportion of patients identified, with terms representing each concept for each disorder cohort.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>In this University of Pennsylvania Institutional Review Board–approved study (#831895, #843620), we conducted a literature review of open-source word embeddings. We identified 7 publicly available sources and characterized each resource according to the training source, unit of processing, context window, preprocessing, embedding technology used, returned units, embedding size, and vocabulary size (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Description of word embedding sources used.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="110"/>
          <col width="110"/>
          <col width="100"/>
          <col width="100"/>
          <col width="90"/>
          <col width="110"/>
          <col width="110"/>
          <col width="90"/>
          <col width="110"/>
          <col width="70"/>
          <thead>
            <tr valign="top">
              <td>Name</td>
              <td>Author and source</td>
              <td>Training source</td>
              <td>Unit</td>
              <td>Context window</td>
              <td>Preprocess (reduce case, remove stop words, term types)</td>
              <td>Embedding technology (gensim, FastText, GloVe<sup>a</sup>, BERT<sup>b</sup>, ELMO, etc)</td>
              <td>Returned unit (1-3 ngrams)</td>
              <td>Embedding size</td>
              <td>Vocab size</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>BioNLP<sup>c</sup> Lab PubMed + PMC<sup>d</sup> W2V</td>
              <td>Pyysalo et al 2013 [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]</td>
              <td>PubMed/<break/>PMC articles</td>
              <td>Token</td>
              <td>5</td>
              <td>Mixed case, no stop words, skip-grams</td>
              <td>word2Vec</td>
              <td>1 ngram</td>
              <td>200</td>
              <td>~4 billion tokens</td>
            </tr>
            <tr valign="top">
              <td>BioNLP Lab Wiki + PubMed + PMC W2V</td>
              <td>Pyysalo et al 2013 [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]</td>
              <td>Wikipedia, PubMed/<break/>PMC articles</td>
              <td>Token</td>
              <td>5</td>
              <td>Mixed case, no stop words, skip-grams</td>
              <td>word2Vec</td>
              <td>1 ngram</td>
              <td>200</td>
              <td>~5.4 billion tokens</td>
            </tr>
            <tr valign="top">
              <td>BioASQ</td>
              <td>Tsatsaronis et al 2015 [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]</td>
              <td>PubMed abstracts</td>
              <td>Token</td>
              <td>5</td>
              <td>Lowercase, no stop words, continuous bag of words</td>
              <td>word2Vec</td>
              <td>1 ngram</td>
              <td>200</td>
              <td>~1.7 billion tokens</td>
            </tr>
            <tr valign="top">
              <td>Clinical Embeddings W2V300</td>
              <td>Flamholz et al 2019 [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]</td>
              <td>PubMed/<break/>PMC/<break/>MIMIC III<sup>e</sup></td>
              <td>Token</td>
              <td>7</td>
              <td>Lowercase, include stop words, skip-grams</td>
              <td>word2Vec</td>
              <td>1-3 ngrams</td>
              <td>300</td>
              <td>~300k tokens</td>
            </tr>
            <tr valign="top">
              <td>BioWordVec Extrinsic</td>
              <td>Zhang et al 2019 [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]</td>
              <td>PubMed + MeSH<sup>f</sup></td>
              <td>Character</td>
              <td>5</td>
              <td>Lowercase, include stop words</td>
              <td>FastText</td>
              <td>1-3 ngrams</td>
              <td>200</td>
              <td>~2.3 billion tokens</td>
            </tr>
            <tr valign="top">
              <td>BioWordVec Intrinsic</td>
              <td>Zhang et al 2019 [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]</td>
              <td>PubMed + MeSH</td>
              <td>Character</td>
              <td>20</td>
              <td>Lowercase, include stop words</td>
              <td>FastText</td>
              <td>1-3 ngrams</td>
              <td>200</td>
              <td>~2.3 million tokens</td>
            </tr>
            <tr valign="top">
              <td>Standard GloVe Embeddings</td>
              <td>Pennington et al 2014 [<xref ref-type="bibr" rid="ref11">11</xref>]</td>
              <td>Common Crawl</td>
              <td>Token</td>
              <td>10</td>
              <td>Mixed case</td>
              <td>GloVe</td>
              <td>1 ngram</td>
              <td>300</td>
              <td>~2.1 billion tokens</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>GloVe: global vectors for word representation.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>BioNLP: biomedical natural language processing.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>PMC: PubMed Central.</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>MIMIC III: Medical Information Mart for Intensive Care III.</p>
          </fn>
          <fn id="table1fn6">
            <p><sup>f</sup>MeSH: Medical Subject Headings.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <sec>
        <title>Constructing the Reference Standard</title>
        <p>We generated a list of terms for COVID-19–related semantic categories of <italic>symptoms</italic> (“fever,” “high fever,” “cough,” “wet cough,” “dry cough,” “congestion,” “nasal congestion,” “pain,” “chest pain,” “muscle pain,” “shortness of breath,” “dyspnea,” “tachypnea,” “malaise,” “headache,” “sore throat”), <italic>findings</italic> (“hypoxia,” “opacities,” “bilateral opacities,” “infiltrates,” “lung infiltrates”), and <italic>disorders</italic> (“ARDS,” “respiratory distress,” “acute respiratory distress syndrome,” “pneumonia”) described in Cascella et al [<xref ref-type="bibr" rid="ref1">1</xref>]. We queried each word embedding source detailed in <xref ref-type="table" rid="table1">Table 1</xref> using these COVID-19–related phrases and retrieved the top 20 phrases based on ranked cosine similarity (terms closest to 1.0 signifying high similarity). Three annotators (a biomedical informatician, a clinical general internist and informatician, and a second-year medical student) encoded each returned phrase with the following semantic class types:</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Negation (black)</italic>: a negation of the query term (eg, “afebrile” is a negation of “fever”)</p>
          </list-item>
          <list-item>
            <p><italic>Synonyms (green)</italic>: a lexical variant of the query term with highly similar or synonymous meaning, including misspellings and short forms (eg, “ARDS” is a synonym for “Acute Respiratory Distress Syndrome”)</p>
          </list-item>
          <list-item>
            <p><italic>Symptom/signs (yellow)</italic>: any symptom, observation, finding, or syndrome that is not a synonym of the query term (eg, “fever” is a symptom returned by “cough”)</p>
          </list-item>
          <list-item>
            <p><italic>Disease/disorders (blue)</italic>: any disease, disorder, or diagnosis that is not a synonym for the query term (eg, “pneumonia” is a disorder returned by “dyspnea”)</p>
          </list-item>
          <list-item>
            <p><italic>Hyponym (light red)</italic>: a more specific semantic type of the query term (eg, “ground-glass opacities” is a hyponym of “opacities”)</p>
          </list-item>
          <list-item>
            <p><italic>Hypernym (dark red)</italic>: a broader semantic type of the query term (eg, “cough” is a hypernym of “productive cough”)</p>
          </list-item>
          <list-item>
            <p><italic>Qualifiers (teal)</italic>: any nonclinical temporal, spatial, quality, extent, or size descriptor (eg, “dry” is a qualifier for “cough”)</p>
          </list-item>
          <list-item>
            <p><italic>Anatomical location (orange)</italic>: any clinical anatomical or positional descriptor (eg, “lower lobe” is an anatomical location)</p>
          </list-item>
          <list-item>
            <p><italic>Therapeutic (purple)</italic>: any medication, therapy, or procedure (eg, “mechanical ventilation” is a therapeutic device)</p>
          </list-item>
          <list-item>
            <p><italic>Other (grey)</italic>: any semantic type that was not among the aforementioned or a nonclinical type (eg, “traffic” returned for “congestion”)</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Assessing Interannotator Agreement</title>
        <p>For each annotator pair, we computed the interannotator agreement for the semantic class types for each queried term using Cohen kappa [<xref ref-type="bibr" rid="ref37">37</xref>], as implemented in sklearn [<xref ref-type="bibr" rid="ref38">38</xref>]. Specifically, for each queried phrase (eg, “fever”), each annotator encoded the <italic>semantic type</italic> of the returned candidate term compared to the queried term (eg, returned term “pyrexia” encoded as a <italic>synonym</italic> for queried term “fever”). We report the overall interannotator agreement by category (symptom, finding, and disorder) and by queried term (“fever,” “dry cough”). We also depict semantic disagreements between each pair of annotators with heat maps generated using matplotlib [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
      </sec>
      <sec>
        <title>Analyzing the Similarity Between COVID-19 Queried and Returned Terms</title>
        <p>We depict the broad range of terms returned across openly available word embedding sources. For each queried term, the returned term will maintain the same semantic type across word embedding sources but might return a different cosine similarity or occur in only select sources. Therefore, for all unique returned terms within the top 20 ranked by cosine similarity, we visualized the returned term based on its frequency among the word embedding sources at any rank using word clouds generated with matplotlib. The size of the word is a weighted representation of how frequently the returned term occurred across the seven word embedding sources; the score is bounded between 0.14 (observed within only one of seven word embedding sources) and 1.0 (observed within all seven word embedding sources). Additionally, of the terms that occurred <italic>at least once</italic> among the top 20 ranked terms across the seven embeddings, we plotted the range of cosine similarities. Observed top-ranked terms may have cosine similarity values ranging from 0 to 1.0. If a top-ranked term was not found within another embedding source, the term received a value of –1.</p>
      </sec>
      <sec>
        <title>Assessing the Semantic Distribution Patterns for Returned Candidate Terms by Source</title>
        <p>We determined the distribution of semantic classes among returned candidates for each queried term according to word embedding source. Our goal is to identify common semantic themes among the queried-returned term pairs that might be driven by the word embedding source construction. We performed a content analysis with simple mean comparisons for each semantic category as well as terms with and without modifiers across embedding sources to identify additional association patterns (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Queried terms (symptoms, findings, and disorders) with and without modifiers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="390"/>
            <col width="0"/>
            <col width="580"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Category and term without modifier</td>
                <td>With modifier</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Symptoms</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“fever”</td>
                <td colspan="2">“high fever”</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“cough”</td>
                <td colspan="2">“wet cough,” “dry cough”</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“congestion”</td>
                <td colspan="2">“nasal congestion”</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“pain”</td>
                <td colspan="2">“chest pain,” “muscle pain”</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Findings</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“opacities”</td>
                <td colspan="2">“bilateral opacities”</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“infiltrates”</td>
                <td colspan="2">“lung infiltrates”</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Disorders</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>“ARDS”<sup>a</sup></td>
                <td colspan="2">“respiratory distress,” “acute respiratory distress syndrome”</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ARDS: acute respiratory distress syndrome.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Generating Symptom Severity Profiles for Patients With Pneumonia, ARDS, and COVID-19</title>
        <p>As a proof of concept, we compared the proportion of patients that can be classified according to COVID-19 illness severity groups using terms indicative of their clinical features for three cohorts: patients with <italic>pneumonia</italic>, <italic>ARDS</italic>, and <italic>COVID-19</italic>. For the patients with pneumonia and ARDS cohorts, we queried all inpatient encounters and their resulting discharge summaries with COVID-19–related disorders: ARDS (International Classification of Diseases [ICD] codes: 518.5, 518.81, 518.82) and pneumonia (ICD codes: 480-488) from the MIMIC III database [<xref ref-type="bibr" rid="ref12">12</xref>]. For the patients with COVID-19 cohort, we queried all COVID-19 inpatient encounters from our EPIC PennChart COVID-19 registry from March 2020 to August 2020 and the resulting discharge summaries. In <xref ref-type="table" rid="table3">Table 3</xref>, we denote the clinical findings associated with COVID-19 respiratory illness severity categories [<xref ref-type="bibr" rid="ref1">1</xref>]. We applied the expanded lexicon for COVID-19 respiratory illness severity clinical features using synonyms detected from all embedding approaches (<italic>keywords + embedding expansion</italic>). For each cohort, we report the proportion of patients with the clinical feature documented within one or more discharge summaries.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Clinical findings according to the COVID-19 respiratory illness severity groups.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="480"/>
            <col width="520"/>
            <thead>
              <tr valign="top">
                <td>COVID-19 respiratory illness severity</td>
                <td>Clinical features</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Mild illness</td>
                <td>Mild fever, cough (dry), sore throat, malaise, headache, muscle pain, nasal congestion</td>
              </tr>
              <tr valign="top">
                <td>Moderate pneumonia</td>
                <td>Cough and shortness of breath</td>
              </tr>
              <tr valign="top">
                <td>Severe pneumonia/acute respiratory distress syndrome</td>
                <td>Fever is associated with severe dyspnea, respiratory distress, tachypnea, and hypoxia</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>We queried seven embedding sources with 15 symptom terms, five finding terms, and four disorder terms, resulting in 10,080 annotations (top 20 returned candidate terms × 24 queried terms × seven word embedding sources × three annotators).</p>
      <sec>
        <title>Assessing Interannotator Agreement</title>
        <p>We observed high overall pairwise interannotator agreement between annotators (ie, A#=Annotator#) for each semantic category: symptoms (0.86-0.99), findings (0.93-0.99), and disorders (0.93-0.99). For A1/A2 and A2/A3, we observed low to moderate interannotator agreement for “malaise” (0.40-0.41), “muscle pain” (0.6), “headache” (0.65-0.68), and “dry cough” (0.68). For A3/A1, interannotator agreement was consistently high (≥0.93). In <xref rid="figure1" ref-type="fig">Figure 1</xref>, we report the distribution of each queried term’s overall agreement between paired annotators. The color bar represents the third annotator pair. Overall agreement by COVID-19 category and by queried term for each annotator pair can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Pairwise interannotator agreement according to semantic category for each queried term.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In <xref rid="figure2" ref-type="fig">Figures 2</xref>-<xref rid="figure4" ref-type="fig">4</xref>, for each returned term, we also computed interannotator agreement across semantic types. Across annotator pairs, we observed high interannotator agreement for all semantic types. Each heat map depicts systematic differences between annotators. In <xref rid="figure2" ref-type="fig">Figure 2</xref>, A1/A2 more often disagreed about whether a returned term was a hypernym, hyponym, or negation. In <xref rid="figure3" ref-type="fig">Figure 3</xref>, A2/A3 more often disagreed about whether a returned term was a synonym, disease or disorder, hypernym, hyponym, other, or negation. In <xref rid="figure4" ref-type="fig">Figure 4</xref>, A3/A1 most often disagreed about whether a returned term was a negation or other term.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>A1/A2 interannotator agreement of returned terms according to semantic type.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>A2/A3 interannotator agreement of returned terms according to semantic type.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>A3/A1 interannotator agreement of returned terms according to semantic type.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Analyzing the Similarity for Returned Candidate Terms</title>
        <p>We report the broad range of queried terms returned across word embedding sources. For brevity, we depict three COVID-19–related concepts, one of each semantic category: <italic>symptom</italic> (“fever”; <xref rid="figure5" ref-type="fig">Figure 5</xref>), <italic>finding</italic> (“lung infiltrates”; <xref rid="figure6" ref-type="fig">Figure 6</xref>), and <italic>disorder</italic> (“acute respiratory distress syndrome”; <xref rid="figure7" ref-type="fig">Figure 7</xref>). For “fever,” synonyms (eg, “pyrexia,” “fevers,” and “febrile”) and signs or symptoms (eg, “chills” and “diarrhea”) were common among the returned terms. For “lung infiltrates,” the most frequent semantic types among the returned terms included anatomical locations (eg, “lungs” and “peribronchial”) and hypernyms (eg, “infiltrate” and “infiltration”). For “ARDS,” disease or disorders (eg, “SARS” [severe acute respiratory syndrome] and “aSARS-CoV”), synonyms (eg, “ards” and “respiratory-distress-syndrome”), and hypernyms (eg, “syndromee” and “syndrome-critical”) were observed commonly among the returned terms.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Word cloud depicting each returned term for “fever.” Colors correspond to semantic class types.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Word cloud depicting each returned term for “lung infiltrates.” Colors correspond to semantic class types.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Word cloud depicting each returned term for “acute respiratory distress syndrome.” Colors correspond to semantic class types. ALI: acute lung injury; ARDS: acute respiratory distress syndrome; ARI: acute respiratory infection; SARS: severe acute respiratory syndrome; SARS-CoV: severe acute respiratory syndrome–related coronavirus.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In <xref rid="figure8" ref-type="fig">Figure 8</xref>, we observe that, given a queried term (eg, “fever,” “lung infiltrates,” and “acute respiratory distress syndrome”), returned terms differ by cosine similarity and variance. For example, some returned terms have high cosine similarity and low variability (left most in red and orange only), while others demonstrate variable cosine similarity and high variability (right most in all colors). Examples of returned terms with <italic>high cosine similarity and low variability</italic> include “fever”: “fevers,” “fevering,” and “pyrexia”; “lung infiltrates”: “infiltration,” “infiltrates,” and “peribronchial”; and “acute respiratory distress syndrome”: “syndrome(ARDS),” “aSARS,” and “syndromeards.” Examples of returned terms with <italic>variable cosine similarity and high variability</italic> include “fever”: “fevered,” “fever-based,” and “fever-like”; “lung infiltrates”: “infiltrational,” “consolidations,” and “bronchioepithelial”; and “acute respiratory distress syndrome”: “syndrome-is” and “syndrome-level.”</p>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Cosine similarity measures for each unique returned term among the top 20 terms across all word embedding sources returned for the queried terms “fever,” “lung infiltrates,” and “acute respiratory distress syndrome.” Color range indicative of cosine similarity level (0.0-1.0), not semantic type.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Assessing the Distribution of Semantic Types for Returned Candidate Terms by Source</title>
        <p>We determined the distribution of semantic classes among returned candidates for each queried term according to word embedding source. Our goal is to identify common semantic themes among the queried and returned candidate term pairs that might be driven by the word embedding source construction. We observed that the <italic>BioWordVec Extrinsic</italic> and <italic>BioWordVec Intrinsic</italic> embeddings (<xref rid="figure9" ref-type="fig">Figure 9</xref> e-f) were more likely to generate synonyms (green), which is notably depicted for “fever,” “headache,” “hypoxia,” “dyspnea,” and “infiltrates.” Word embedding sources generated based on characters tend to return more synonyms (mean count of 7.2 synonyms) compared to token-based embedding sources (mean count ranged from 2.04 to 3.4 synonyms). We also observed more negation terms for “hypoxia” (mean count of 2.29 negations); “congestion” (mean count of 1.57 negations); and “dry cough,” “wet cough,” and “tachypnea” (all had mean counts of 1.0 negations) compared to other terms (mean counts ranged from 0.00 to 0.71 negations). We observed a high mean count of hypernyms for “dry cough” (mean count of 6.43 hypernyms), “high fever” (mean count of 5.57 hypernyms), and “acute respiratory distress syndrome” (mean count of 4.43 hypernyms) over other terms (mean counts ranged from 0 to 3.29 hypernyms). Across the other word embeddings (<xref rid="figure9" ref-type="fig">Figure 9</xref> a-d and g), if a symptom or sign queried term was provided, we more often observed a symptom or sign returned term (mean average of 6.62 symptoms or signs) compared to nonsymptom or sign queried terms (mean average of 3.035 symptoms or signs). This also held true for disorders (mean average of 6.24 disorders) compared to nondisorders (mean average of 1.18 disorders). 
Across word embedding sources (<xref rid="figure9" ref-type="fig">Figure 9</xref>), we observed that qualifiers were more often returned when the queried term contained a qualifier for some terms (eg, “dry cough” and “wet cough” return time and consistency qualifiers like “wet” and “runny”; both mean counts of 4.14 qualified terms) over the nonqualified queried term “cough” (mean count of 1.71 qualified terms). Similar patterns were observed for “high fever” (mean count of 3.71 qualified terms), “fever” (mean count of 0.0 qualified terms), “bilateral opacities” (mean count of 6.14 qualified terms), and “opacities” (mean count of 2.71 qualified terms). Furthermore, if a queried term contained an anatomical location as an adjective in the term phrase (eg, “nasal congestion”), the returned terms were often anatomical locations compared to queried terms without adjectives. We observed notable differences in mean counts of returned terms with anatomical qualifiers for “nasal congestion” (mean count of 6.71 anatomical terms) and “congestion” (mean count of 0.42 anatomical terms), “chest pain” (mean count of 8.43 anatomical terms) and “pain” (mean count of 3.57 anatomical terms), and “lung infiltrates” (mean count of 10.57 anatomical terms) and “infiltrates” (mean count of 6.71 anatomical terms). In a few cases, the <italic>standard GloVe embeddings</italic>, <italic>BioWordVec Extrinsic</italic>, and <italic>BioWordVec Intrinsic</italic> embeddings returned some terms with common term usage (eg, “congestion” returns “traffic,” “bypass,” or stop words such as “and,” “a,” and “of”).</p>
        <fig id="figure9" position="float">
          <label>Figure 9</label>
          <caption>
            <p>For each symptom, finding, and disorder queried term, the distribution of semantic types for returned term colored by semantic type for each embedding source: (a) BioNLP Lab PubMed + PMC W2V, (b) BioNLP LabWiki + PubMed + PMC W2V, (c) BioASQ, (d) Clinical Embeddings W2V300, (e) BioWordVec Extrinsic, (f) BioWordVec Intrinsic, and (g) Standard GloVe Embeddings. ARDS: acute respiratory distress syndrome.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Generating Symptom Severity Profiles for Patients With Pneumonia, ARDS, and COVID-19</title>
        <p><xref rid="figure10" ref-type="fig">Figure 10</xref> shows the proportion of patients from each disorder cohort (pneumonia, ARDS, and COVID-19) that have one or more terms documented within their discharge summary representing clinical features from <xref ref-type="table" rid="table3">Table 3</xref>. The total number of patients in each cohort varied: pneumonia (n=6410), ARDS (n=8647), and COVID-19 (n=2397). A higher proportion of patients had documented fever (0.61-0.84), cough (0.41-0.55), shortness of breath (0.40-0.59), and hypoxia (0.51-0.56) retrieved than other clinical features. Terms for dry cough returned a higher proportion of patients with COVID-19 (0.07) than pneumonia (0.05) and ARDS (0.03) populations.</p>
        <fig id="figure10" position="float">
          <label>Figure 10</label>
          <caption>
            <p>The proportion of patients with each COVID-19 clinical feature documented within their discharge summary according to disorders (pneumonia, ARDS, and COVID-19). ARDS: acute respiratory distress syndrome.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e21679_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Assessing Interannotator Agreement</title>
        <p>We observed high overall pairwise interannotator agreement for the symptoms, findings, and disorder categories. Annotators A1 and A3 were more often in agreement. For the A1/A2 and A2/A3 pairs, we observed low to moderate interannotator agreement for queried terms such as “malaise,” “muscle pain,” “headache,” and “dry cough.” Annotators A1 and A3 systematically classified notably fewer returned terms as hypernyms and hyponyms than A2. For example, “migraine” is a hyponym of “headache.” Additionally, A2 more easily identified negated terms through medical terminology. Many cases required more clinical domain knowledge to make these distinctions, which were easier for the general internist.</p>
      </sec>
      <sec>
        <title>Analyzing the Similarity Between COVID-19 Queried and Returned Terms</title>
        <p>When analyzing the cosine similarities between queried terms and returned terms, we observed that returned terms range from <italic>high cosine similarity and low variability</italic> to <italic>variable cosine similarity and high variability</italic>. We hypothesize that terms with high cosine similarity and low variability are more likely to be synonyms and useful for training an information extraction system. In practice, the presence and cosine similarities of a term varied across word embedding sources. Our ability to identify and rank likely synonyms for lexicon development may be improved with additional processing steps and comparisons between the queried and returned terms for lexical similarity [<xref ref-type="bibr" rid="ref40">40</xref>], morphological derivation [<xref ref-type="bibr" rid="ref8">8</xref>], and short form construction and expansion [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
      </sec>
      <sec>
        <title>Assessing the Semantic Distribution Patterns for Returned Candidate Terms by Source</title>
        <p>We determined the distribution of semantic classes among returned candidates for each queried term according to the word embedding source. Our study intentions were to assess the distributional hypothesis that words with similar meanings are often used in similar contexts. Generally, if a symptom or sign queried term was provided, we often observed a symptom or sign returned term. This also held true for disorders. Furthermore, our goal was to identify common semantic themes among the queried and returned candidate term pairs that might be driven by the word embedding source construction. We observed that the <italic>BioWordVec Extrinsic</italic> and <italic>BioWordVec Intrinsic</italic> embeddings were more likely to generate synonyms. We hypothesize that this is likely due to training based on the characters rather than the token; thus, the returned terms often share a common set of characters (queried term: “<italic>fever</italic>”; returned term: “<italic>fever</italic>ish”) or high lexical similarity. Character-based embeddings will often return lexical variations of the queried term. Although <italic>BioNLP</italic>, <italic>BioASQ</italic>, and <italic>Clinical Embeddings</italic> generated fewer synonyms, these were often medical terms for the lay queried term (eg, “lethargy” for “malaise,” “cephalea” for “headache,” and “rhinorrhea” for “nasal congestion”). To maximize the diversity of learned synonyms, multiple embeddings could be most beneficial. Returned negated terms were expressed with prefixes (eg, “non-pneumonia-related”), suffixes (eg, “fever-free”), or medical terminology (eg, “normoxia”). Hypernyms were commonly observed among queried terms with an adjectival phrase (eg, “high fever,” “muscle pain,” “dry cough,” and “lung infiltrates”). Moreover, we observed that qualifiers were often returned when the queried term contained a qualifier (eg, time, consistency, and anatomical location qualifiers). 
For developing a clinical information extraction system, these returned terms can be useful for brainstorming synonyms as inclusionary terms as well as antonyms as exclusionary terms. We suspect that a mix of hypernyms and qualifiers were often returned, given the semantics of the individual parts of the queried phrase. It was not surprising that <italic>standard GloVe</italic> embeddings returned some terms with a nonclinical word sense (eg, “congestion” returns “traffic” or “bypass”) because they were trained using the CommonCrawl domain-independent corpora. Similarly, <italic>BioWordVec Extrinsic</italic> and <italic>BioWordVec Intrinsic</italic> occasionally return stop words, as these were not removed prior to training and perhaps should be for detecting meaningful synonyms.</p>
      </sec>
      <sec>
        <title>Generating Symptom Severity Profiles for Patients With Pneumonia, ARDS, and COVID-19</title>
        <p>We created an expanded lexicon of COVID-19 respiratory illness clinical features (<xref ref-type="table" rid="table3">Table 3</xref>) using synonyms detected from all embedding approaches. We assessed the proportion of patients from three disorder cohorts (pneumonia, ARDS, and COVID-19) with each clinical feature documented within their discharge summary. We observed that terms indicative of clinical features for fever, cough, shortness of breath, and hypoxia retrieved a higher proportion of patients than other clinical features. For fever and cough, our lexicons for capturing contextualized mentions of these clinical features (eg, high fever or wet or dry cough) retrieved modest proportions of patient cases. This is likely due to the variability of qualifications and quantifications of these symptoms (eg, productive cough and fever of 102° F) in discharge summaries. Terms indicative of dry cough returned a higher proportion of patients with COVID-19 than pneumonia and ARDS populations. This is not surprising given that this is a prominent symptom reported among patients with COVID-19.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>Our study has a few notable limitations. We began this study during the early stages of the COVID-19 pandemic when the symptomatology was less understood. COVID-19 is a heterogeneous disease with emerging symptomatology identified through ongoing clinical observational studies. Emerging COVID-19–related symptoms (ie, loss of smell, loss of taste, and COVID toes) were not included in our analysis, as their association with COVID-19 was not well understood at the time of our study. We leveraged existing word embedding sources to better understand the utility of embeddings for synonym generation. We recognize that further experimentation is needed to support broader claims of their utility. As a proof of concept of patient information retrieval, we applied an expanded lexicon of terms representing clinical features of COVID-19 to three disorder cohorts (pneumonia, ARDS, and COVID-19). Although these terms retrieved a high proportion of patients, we acknowledge that additional terms might be necessary to accurately identify these features and that contextualization (ie, negation, severity, experiencer, and temporality [<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]) is critical to generating accurate patient profiles. We look forward to addressing these issues as next steps within our clinical information extraction pipeline powered by Linguamatics [<xref ref-type="bibr" rid="ref45">45</xref>]. These text-derived and contextualized variables will be available through our clinical research databases—COVID-19 Informatics for Integrating Biology and the Bedside database [<xref ref-type="bibr" rid="ref46">46</xref>] and Penn Genotype and Phenotype database supported by the Observational Medical Outcomes Partnership common data model [<xref ref-type="bibr" rid="ref47">47</xref>]—at the end of Spring 2021.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>Word embeddings are a valuable technology for learning semantically and syntactically related terms, including synonyms, and are useful for text classification and information extraction tasks. When leveraging openly available word embedding sources, choices made in the development of the embeddings can significantly influence the types of phrases and information learned.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Overall agreement by COVID-19 category and by queried term for each annotator pair.</p>
        <media xlink:href="medinform_v9i2e21679_app1.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ARDS</term>
          <def>
            <p>acute respiratory distress syndrome</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BioNLP</term>
          <def>
            <p>biomedical natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GloVe</term>
          <def>
            <p>global vectors for word representation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ICD</term>
          <def>
            <p>International Classification of Diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">MIMIC III</term>
          <def>
            <p>Medical Information Mart for Intensive Care III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SARS</term>
          <def>
            <p>severe acute respiratory syndrome</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was approved by the University of Pennsylvania Institutional Review Board. We thank the University of Pennsylvania Perelman School of Medicine for the generous startup funds that supported this study as well as the partial support for DM through the University of Pennsylvania National Institutes of Health National Center for Advancing Translational Sciences Clinical and Translational Science Awards Award UL1-TR001878. We also thank the thoughtful reviewers for their invaluable feedback and the open-source community for the useful word embedding sources. The data files for this project can be found at our <italic>Semantic Analysis of Text to Inform Clinical Action</italic> laboratory GitHub page [<xref ref-type="bibr" rid="ref48">48</xref>].</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cascella</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rajnik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuomo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dulebohn</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Di Napoli</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Features, Evaluation, and Treatment of Coronavirus</source>
          <year>2020</year>
          <publisher-loc>Treasure Island, FL</publisher-loc>
          <publisher-name>StatPearls Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Barnett</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Boland</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Demiris</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Herman</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Himes</surname>
              <given-names>BE</given-names>
            </name>
            <name name-style="western">
              <surname>Hubbard</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Ritchie</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Urbanowicz</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>Ideas for how informaticians can get involved with COVID-19 research</article-title>
          <source>BioData Min</source>
          <year>2020</year>
          <volume>13</volume>
          <fpage>3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://biodatamining.biomedcentral.com/articles/10.1186/s13040-020-00213-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13040-020-00213-y</pub-id>
          <pub-id pub-id-type="medline">32419848</pub-id>
          <pub-id pub-id-type="pii">213</pub-id>
          <pub-id pub-id-type="pmcid">PMC7216865</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Turano</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Box</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A natural language processing system for national COVID-19 surveillance in the US Department of Veterans Affairs</article-title>
          <year>2020</year>
          <conf-name>1st Workshop on NLP for COVID-19 at ACL 2020</conf-name>
          <conf-date>July 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/2020.nlpcovid19-acl.10"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>VM</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Finan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Can</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Abd-El-Barr</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Shadick</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Large-scale identification of patients with cerebral aneurysms using natural language processing</article-title>
          <source>Neurology</source>
          <year>2017</year>
          <month>01</month>
          <day>10</day>
          <volume>88</volume>
          <issue>2</issue>
          <fpage>164</fpage>
          <lpage>168</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27927935"/>
          </comment>
          <pub-id pub-id-type="doi">10.1212/WNL.0000000000003490</pub-id>
          <pub-id pub-id-type="medline">27927935</pub-id>
          <pub-id pub-id-type="pii">WNL.0000000000003490</pub-id>
          <pub-id pub-id-type="pmcid">PMC5224711</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Sioutos</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Crowley</surname>
              <given-names>RS</given-names>
            </name>
          </person-group>
          <article-title>Effectiveness of lexico-syntactic pattern matching for ontology enrichment with clinical documents</article-title>
          <source>Methods Inf Med</source>
          <year>2011</year>
          <volume>50</volume>
          <issue>5</issue>
          <fpage>397</fpage>
          <lpage>407</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21057720"/>
          </comment>
          <pub-id pub-id-type="doi">10.3414/ME10-01-0020</pub-id>
          <pub-id pub-id-type="medline">21057720</pub-id>
          <pub-id pub-id-type="pii">10-01-0020</pub-id>
          <pub-id pub-id-type="pmcid">PMC3125434</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>QT</given-names>
            </name>
            <name name-style="western">
              <surname>Redd</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rindflesch</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nebeker</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Synonym, topic model and predicate-based query expansion for retrieving clinical documents</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2012</year>
          <volume>2012</volume>
          <fpage>1050</fpage>
          <lpage>1059</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23304381"/>
          </comment>
          <pub-id pub-id-type="medline">23304381</pub-id>
          <pub-id pub-id-type="pmcid">PMC3540443</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Henriksson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duneld</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Identifying synonymy between SNOMED clinical terms of varying length using distributional analysis of electronic health records</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2013</year>
          <volume>2013</volume>
          <fpage>600</fpage>
          <lpage>609</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24551362"/>
          </comment>
          <pub-id pub-id-type="medline">24551362</pub-id>
          <pub-id pub-id-type="pmcid">PMC3900203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Velupillai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hurdle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kious</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Vocabulary development to support information extraction of substance abuse from psychiatry notes</article-title>
          <year>2016</year>
          <conf-name>The 15th Workshop on Biomedical Natural Language Processing</conf-name>
          <conf-date>August 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w16-2912</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Burges</surname>
              <given-names>CJC</given-names>
            </name>
            <name name-style="western">
              <surname>Bottou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghahramani</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>KQ</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>NIPS'13: Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>
          <year>2013</year>
          <publisher-loc>Red Hook, NY</publisher-loc>
          <publisher-name>Curran Associates Inc</publisher-name>
          <fpage>3111</fpage>
          <lpage>3119</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching word vectors with subword information</article-title>
          <source>Trans Assoc Computational Linguistics</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP '14</conf-name>
          <conf-date>October 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>1543</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <article-title>BioASQ releases continuous space word vectors obtained by applying Word2Vec to PubMed abstracts</article-title>
          <source>BioASQ</source>
          <access-date>2020-04-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts">http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ginter</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Moen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Salakoski</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Distributional semantics resources for biomedical text processing</article-title>
          <source>Proceedings of the 5th International Symposium on Languages in Biology and Medicine</source>
          <year>2013</year>
          <conf-name>LBM '13</conf-name>
          <conf-date>December 12-13, 2013</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>BioWordVec, improving biomedical word embeddings with subword information and MeSH</article-title>
          <source>Sci Data</source>
          <year>2019</year>
          <month>05</month>
          <day>10</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>52</fpage>
          <pub-id pub-id-type="doi">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="medline">31076572</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6510737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flamholz</surname>
              <given-names>ZN</given-names>
            </name>
            <name name-style="western">
              <surname>Ungar</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Weissman</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>Word embeddings trained on published case reports are lightweight, effective for clinical tasks, and free of protected health information</article-title>
          <source>medRxiv.</source>
          <comment>Preprint posted online December 4, 2019</comment>
          <pub-id pub-id-type="doi">10.1101/19013268</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Adam</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Melton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Semantic similarity and relatedness between clinical terms: an experimental study</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2010</year>
          <month>11</month>
          <day>13</day>
          <volume>2010</volume>
          <fpage>572</fpage>
          <lpage>576</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21347043"/>
          </comment>
          <pub-id pub-id-type="medline">21347043</pub-id>
          <pub-id pub-id-type="pmcid">PMC3041430</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>SVS</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Melton</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggieri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Towards a framework for developing semantic relatedness reference standards</article-title>
          <source>J Biomed Inform</source>
          <year>2011</year>
          <month>04</month>
          <volume>44</volume>
          <issue>2</issue>
          <fpage>251</fpage>
          <lpage>265</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(10)00156-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2010.10.004</pub-id>
          <pub-id pub-id-type="medline">21044697</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(10)00156-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC3063326</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>SVS</given-names>
            </name>
            <name name-style="western">
              <surname>Patwardhan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Measures of semantic similarity and relatedness in the biomedical domain</article-title>
          <source>J Biomed Inform</source>
          <year>2007</year>
          <month>06</month>
          <volume>40</volume>
          <issue>3</issue>
          <fpage>288</fpage>
          <lpage>299</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(06)00064-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2006.06.004</pub-id>
          <pub-id pub-id-type="medline">16875881</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(06)00064-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hliaoutakis</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Semantic similarity measures in MeSH ontology and their application to information retrieval on Medline</article-title>
          <source>InteLLigence</source>
          <year>2005</year>
          <access-date>2021-02-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.intelligence.tuc.gr/publications/Hliautakis.pdf">http://www.intelligence.tuc.gr/publications/Hliautakis.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <source>Ranker</source>
          <access-date>2020-04-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ranker.com">https://www.ranker.com</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dhingra</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>A comparative study of word embeddings for reading comprehension</article-title>
          <source>arXiv.</source>
          <year>2017</year>
          <comment>Preprint posted on March 2, 2017 <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1703.00993"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nishida</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Nishida</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Asano</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tomita</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Natural language inference with definition embedding considering context on the fly</article-title>
          <year>2018</year>
          <conf-name>The Third Workshop on Representation Learning for NLP</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w18-3007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mohd</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Text document summarization using word embedding</article-title>
          <source>Expert Syst Appl</source>
          <year>2020</year>
          <month>04</month>
          <volume>143</volume>
          <fpage>112958</fpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2019.112958</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Topaz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Murga</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gaddis</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>MV</given-names>
            </name>
            <name name-style="western">
              <surname>Bar-Bachar</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bowles</surname>
              <given-names>KH</given-names>
            </name>
          </person-group>
          <article-title>Mining fall-related information in clinical notes: comparison of rule-based and novel word embedding-based machine learning approaches</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <month>02</month>
          <volume>90</volume>
          <fpage>103103</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(19)30021-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2019.103103</pub-id>
          <pub-id pub-id-type="medline">30639392</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(19)30021-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chanda</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Egleston</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Vucetic</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>EHR phenotyping via jointly embedding medical concepts and words into a unified vector space</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2018</year>
          <month>12</month>
          <day>12</day>
          <volume>18</volume>
          <issue>Suppl 4</issue>
          <fpage>123</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-018-0672-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-018-0672-0</pub-id>
          <pub-id pub-id-type="medline">30537974</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-018-0672-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6290514</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dehmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yli-Harja</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Emmert-Streib</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Combining deep learning with token selection for patient phenotyping from electronic health records</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <month>01</month>
          <day>29</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>1432</fpage>
          <pub-id pub-id-type="doi">10.1038/s41598-020-58178-1</pub-id>
          <pub-id pub-id-type="medline">31996705</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-58178-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC6989657</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Sukul</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmoudi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Waljee</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Stansbury</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nallamothu</surname>
              <given-names>BK</given-names>
            </name>
          </person-group>
          <article-title>Predicting 30-day hospital readmissions using artificial neural networks with medical code embedding</article-title>
          <source>bioRxiv.</source>
          <comment>Preprint posted on August 20, 2019</comment>
          <pub-id pub-id-type="doi">10.1101/741504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khattak</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Jeblee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pou-Prom</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Abdalla</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meaney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rudzicz</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>A survey of word embeddings for clinical text</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <month>12</month>
          <volume>4</volume>
          <fpage>100057</fpage>
          <pub-id pub-id-type="doi">10.1016/j.yjbinx.2019.100057</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Masino</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>CC</given-names>
            </name>
          </person-group>
          <article-title>A framework for developing and evaluating word embeddings of drug-named entity</article-title>
          <year>2018</year>
          <conf-name>The BioNLP 2018 Workshop</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w18-2319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schild</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Blackburn</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stringhini</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zannettou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>"Go eat a bat, Chang!": an early look on the emergence of sinophobic behavior on web communities in the face of COVID-19</article-title>
          <source>arXiv.</source>
          <comment>Preprint posted on April 8, 2020 <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2004.04046"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Magge</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenbacher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A chronological and geographical analysis of personal reports of COVID-19 on Twitter</article-title>
          <source>medRxiv.</source>
          <comment>Preprint posted on April 22, 2020</comment>
          <pub-id pub-id-type="doi">10.1101/2020.04.19.20069948</pub-id>
          <pub-id pub-id-type="medline">32511608</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <source>Biomedical Natural Language Processing</source>
          <access-date>2021-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bio.nlplab.org/">https://bio.nlplab.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsatsaronis</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Balikas</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Malakasiotis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Partalas</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zschunke</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alvers</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenborn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Krithara</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Petridis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Polychronopoulos</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Almirantis</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pavlopoulos</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Baskiotis</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gallinari</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Artiéres</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ngomo</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Heino</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gaussier</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Barrio-Alvers</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schroeder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Androutsopoulos</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Paliouras</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>An overview of the BIOASQ large-scale biomedical semantic indexing and question answering competition</article-title>
          <source>BMC Bioinformatics</source>
          <year>2015</year>
          <month>04</month>
          <day>30</day>
          <volume>16</volume>
          <fpage>138</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0564-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-015-0564-6</pub-id>
          <pub-id pub-id-type="medline">25925131</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-015-0564-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC4450488</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weissman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>gweissman / clinical_embeddings</article-title>
          <source>GitHub</source>
          <access-date>2021-01-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/gweissman/clinical_embeddings">https://github.com/gweissman/clinical_embeddings</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <article-title>ncbi-nlp / BioWordVec</article-title>
          <source>GitHub</source>
          <access-date>2021-01-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ncbi-nlp/BioWordVec">https://github.com/ncbi-nlp/BioWordVec</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McHugh</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>Interrater reliability: the kappa statistic</article-title>
          <source>Biochem Med (Zagreb)</source>
          <year>2012</year>
          <volume>22</volume>
          <issue>3</issue>
          <fpage>276</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.biochemia-medica.com/2012/22/276"/>
          </comment>
          <pub-id pub-id-type="medline">23092060</pub-id>
          <pub-id pub-id-type="pmcid">PMC3900052</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vandeplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hunter</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Matplotlib: a 2D graphics environment</article-title>
          <source>Comput Sci Eng</source>
          <year>2007</year>
          <month>05</month>
          <volume>9</volume>
          <issue>3</issue>
          <fpage>90</fpage>
          <lpage>95</lpage>
          <pub-id pub-id-type="doi">10.1109/mcse.2007.55</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>An unsupervised and customizable misspelling generator for mining noisy health-related text sources</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>12</month>
          <volume>88</volume>
          <fpage>98</fpage>
          <lpage>107</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30216-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.11.007</pub-id>
          <pub-id pub-id-type="medline">30445220</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30216-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC6322919</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Henriksson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Skeppstedt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Daudaravičius</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Duneld</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Synonym extraction and abbreviation expansion with ensembles of semantic spaces</article-title>
          <source>J Biomed Semantics</source>
          <year>2014</year>
          <month>02</month>
          <day>05</day>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/2041-1480-5-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/2041-1480-5-6</pub-id>
          <pub-id pub-id-type="medline">24499679</pub-id>
          <pub-id pub-id-type="pii">2041-1480-5-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC3937097</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wiebe</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Harkema</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dowling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Semantic annotation of clinical events for generating a problem list</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2013</year>
          <volume>2013</volume>
          <fpage>1032</fpage>
          <lpage>41</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24551392"/>
          </comment>
          <pub-id pub-id-type="medline">24551392</pub-id>
          <pub-id pub-id-type="pmcid">PMC3900128</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Davoudi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
          </person-group>
          <article-title>Annotation and extraction of age and temporally-related events from clinical histories</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>12</month>
          <day>30</day>
          <volume>20</volume>
          <issue>Suppl 11</issue>
          <fpage>338</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-01333-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-01333-5</pub-id>
          <pub-id pub-id-type="medline">33380319</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-01333-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC7772895</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamoto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bradshaw</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kohlmann</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Schiffman</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Weir</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Borbolla</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Del Fiol</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Determining onset for familial breast and colorectal cancer from family history comments in the electronic health record</article-title>
          <source>AMIA Jt Summits Transl Sci Proc</source>
          <year>2019</year>
          <volume>2019</volume>
          <fpage>173</fpage>
          <lpage>181</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31258969"/>
          </comment>
          <pub-id pub-id-type="medline">31258969</pub-id>
          <pub-id pub-id-type="pmcid">PMC6568127</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <source>Linguamatics</source>
          <access-date>2021-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.linguamatics.com/">https://www.linguamatics.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="web">
          <source>i2b2</source>
          <access-date>2021-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.i2b2.org/software/index.html">https://www.i2b2.org/software/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <article-title>OMOP common data model</article-title>
          <source>Observational Health Data Sciences and Informatics</source>
          <access-date>2021-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ohdsi.org/data-standardization/the-common-data-model/">https://www.ohdsi.org/data-standardization/the-common-data-model/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="web">
          <article-title>semantica-NLP / COVID-19_embeddings</article-title>
          <source>GitHub</source>
          <access-date>2021-02-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/semantica-NLP/COVID-19_embeddings">https://github.com/semantica-NLP/COVID-19_embeddings</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
