<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i11e22508</article-id>
      <article-id pub-id-type="pmid">33245284</article-id>
      <article-id pub-id-type="doi">10.2196/22508</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Identification of Semantically Similar Sentences in Clinical Notes: Iterative Intermediate Training Using Multi-Task Learning</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Verspoor</surname>
            <given-names>Karin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Abeysinghe</surname>
            <given-names>Rashmie</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sun</surname>
            <given-names>Tien Lung</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Mahajan</surname>
            <given-names>Diwakar</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>IBM Research</institution>
            <addr-line>1101 Kitchawan Road</addr-line>
            <addr-line>Yorktown Heights, NY, 10598</addr-line>
            <country>United States</country>
            <phone>1 914 945 1614</phone>
            <email>dmahaja@us.ibm.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9791-2038</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Poddar</surname>
            <given-names>Ananya</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9846-3908</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Liang</surname>
            <given-names>Jennifer J</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5197-1590</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Yen-Ting</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2970-2455</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Prager</surname>
            <given-names>John M</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2976-9051</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Suryanarayanan</surname>
            <given-names>Parthasarathy</given-names>
          </name>
          <degrees>BTECH</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8619-2976</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Raghavan</surname>
            <given-names>Preethi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2839-5847</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Tsou</surname>
            <given-names>Ching-Huei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1273-5904</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>IBM Research</institution>
        <addr-line>Yorktown Heights, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Formerly IBM Research</institution>
        <addr-line>Yorktown Heights, NY</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Diwakar Mahajan <email>dmahaja@us.ibm.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>27</day>
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>11</issue>
      <elocation-id>e22508</elocation-id>
      <history>
        <date date-type="received">
          <day>31</day>
          <month>7</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>10</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>13</day>
          <month>10</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Diwakar Mahajan, Ananya Poddar, Jennifer J Liang, Yen-Ting Lin, John M Prager, Parthasarathy Suryanarayanan, Preethi Raghavan, Ching-Huei Tsou. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 27.11.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/11/e22508/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Although electronic health records (EHRs) have been widely adopted in health care, effective use of EHR data is often limited because of redundant information in clinical notes introduced by the use of templates and copy-paste during note generation. Thus, it is imperative to develop solutions that can condense information while retaining its value. A step in this direction is measuring the semantic similarity between clinical text snippets. To address this problem, we participated in the 2019 National NLP Clinical Challenges (n2c2)/Open Health Natural Language Processing Consortium (OHNLP) clinical semantic textual similarity (ClinicalSTS) shared task.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to improve the performance and robustness of semantic textual similarity in the clinical domain by leveraging manually labeled data from related tasks and contextualized embeddings from pretrained transformer-based language models.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The ClinicalSTS data set consists of 1642 pairs of deidentified clinical text snippets annotated in a continuous scale of 0-5, indicating degrees of semantic similarity. We developed an iterative intermediate training approach using multi-task learning (IIT-MTL), a multi-task training approach that employs iterative data set selection. We applied this process to bidirectional encoder representations from transformers on clinical text mining (ClinicalBERT), a pretrained domain-specific transformer-based language model, and fine-tuned the resulting model on the target ClinicalSTS task. We incrementally ensembled the output from applying IIT-MTL on ClinicalBERT with the output of other language models (bidirectional encoder representations from transformers for biomedical text mining [BioBERT], multi-task deep neural networks [MT-DNN], and robustly optimized BERT approach [RoBERTa]) and handcrafted features using regression-based learning algorithms. On the basis of these experiments, we adopted the top-performing configurations as our official submissions.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Our system ranked first out of 87 submitted systems in the 2019 n2c2/OHNLP ClinicalSTS challenge, achieving state-of-the-art results with a Pearson correlation coefficient of 0.9010. This winning system was an ensembled model leveraging the output of IIT-MTL on ClinicalBERT with BioBERT, MT-DNN, and handcrafted medication features.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study demonstrates that IIT-MTL is an effective way to leverage annotated data from related tasks to improve performance on a target task with a limited data set. This contribution opens new avenues of exploration for optimized data set selection to generate more robust and universal contextual representations of text in the clinical domain.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>electronic health records</kwd>
        <kwd>semantic textual similarity</kwd>
        <kwd>natural language processing</kwd>
        <kwd>multi-task learning</kwd>
        <kwd>transfer learning</kwd>
        <kwd>deep learning</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>The wide adoption of electronic health records (EHRs) has led to clinical benefits with increased efficiency and financial benefits [<xref ref-type="bibr" rid="ref1">1</xref>]. Although electronic documentation has greatly improved the legibility and accessibility of clinical documentation, the use of templates and copy-paste during note generation has inadvertently introduced unnecessary, redundant, and potentially erroneous information (ie, note bloat), resulting in decreased readability and functional usability of the generated clinical notes [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. A previous study [<xref ref-type="bibr" rid="ref6">6</xref>] on 23,630 clinical notes identified that in a typical note, only 18% of the text was manually entered, whereas 46% was copied and 36% imported. This problem of note bloat not only increases physician cognitive burden [<xref ref-type="bibr" rid="ref7">7</xref>] but also becomes a challenge for the secondary use of EHRs in clinical informatics [<xref ref-type="bibr" rid="ref8">8</xref>]. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates this challenge with an example of 2 sample clinical notes from the same patient from consecutive visits; blue and yellow highlighted text indicate content that has been added or modified, respectively, whereas the plain unhighlighted text indicates information that is the same across clinical notes.</p>
        <p>One way to minimize data redundancy and highlight new information in unstructured clinical notes can be to compute the semantic similarity between clinical text snippets. This process of measuring the degree of semantic equivalence between clinical text snippets is known as clinical semantic textual similarity [<xref ref-type="bibr" rid="ref9">9</xref>]. As semantic textual similarity (STS) is a foundational language understanding problem, successful modeling of this task may help improve other higher-level applications in the clinical domain [<xref ref-type="bibr" rid="ref9">9</xref>], such as clinical question answering with evidence-based retrieval, clinical text summarization, semantic search, conversational systems, and clinical decision support.</p>
        <p>The 2019 National NLP Clinical Challenges (n2c2)/Open Health Natural Language Processing Consortium (OHNLP) track on clinical semantic textual similarity (ClinicalSTS) [<xref ref-type="bibr" rid="ref10">10</xref>] was organized to tackle this specific task: given a pair of clinical text snippets, assign a numerical score from 0 to 5 to indicate the degree of semantic similarity. This is an extension of a previous challenge from BioCreative/OHNLP 2018 ClinicalSTS [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>] that was inspired by the Semantic Evaluation (SemEval) semantic textual similarity (STS) shared tasks [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], which have been organized since 2012 in the general domain.</p>
        <p>Pretrained language models have been shown to be effective for achieving state-of-the-art results on many general and clinical domain natural language processing (NLP) tasks [<xref ref-type="bibr" rid="ref19">19</xref>], including STS. However, when the target domain differs substantially from the pretraining corpus, the contextualized embeddings may be ineffective for the target task. Furthermore, when the amount of training data is limited, as is common for clinical NLP tasks, fine-tuning experiments are potentially brittle and rely on the pretrained encoder parameters to be reasonably close to an ideal setting for the target task [<xref ref-type="bibr" rid="ref20">20</xref>]. A previous study has shown that small training data sets can significantly benefit from an intermediate training step [<xref ref-type="bibr" rid="ref20">20</xref>]. In a complementary work, multi-task learning (MTL) [<xref ref-type="bibr" rid="ref21">21</xref>] has been shown to be effective in leveraging supervised data from multiple related tasks for a target task. Furthermore, it has been observed that MTL and language model pretraining are complementary technologies [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>On the basis of these observations, we present a novel methodology that iteratively performs intermediate training of a pretrained language model in an MTL setup using related data-rich tasks. In this iterative process, related data sets were purposefully selected to induce representative knowledge of the target task. In addition, we evaluated the impact of combining multiple transformer-based language models pretrained on diverse corpora. Our system ranked first in the 2019 n2c2/OHNLP ClinicalSTS challenge, achieving state-of-the-art results.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Two sample clinical notes for the same patient from consecutive visits. Plain text indicates same content between 2 notes; italics (yellow highlight) indicate the content that has been modified, and bold (blue highlight) indicates new content in the second note.</p>
          </caption>
          <graphic xlink:href="medinform_v8i11e22508_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Relevant Literature</title>
        <p>STS is defined as the comparison of a pair of text snippets, approximately one sentence in length, resulting in a numerical score that takes a value on a continuous scale of 0 to 5, indicating degrees of semantic similarity [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. STS, along with paraphrase detection and textual entailment, is a form of semantic relatedness task. Paraphrase detection is the identification of sentences that are semantically identical [<xref ref-type="bibr" rid="ref22">22</xref>], whereas textual entailment is the task of reasoning if one text snippet can be inferred from another [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. STS is more similar to paraphrase detection because of the symmetry of the relationship, as compared with entailment, which is asymmetric. However, unlike paraphrase detection, STS expands on the binary output scoring in paraphrase detection to capture gradations of relatedness.</p>
        <p>Early research on STS, in both the general and clinical domains, focused on lexical semantics, basic syntactic similarity, surface form matching, and alignment-based methods [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. The overarching theme behind these methods is the identification, alignment, and scoring of semantically related words and phrases and aggregating their scores. However, the absence of a principled way of combining the topological and semantic information led to the construction of sentence representations by building a linear composition of the distributed representations of individual words [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]. Although these techniques were an improvement over traditional approaches, they fell short as they did not take the surrounding context into account while generating distributed representations.</p>
        <p>Early attempts at building richer representations that encode several linguistic aspects of a sentence for computing similarity included paragraph vectors [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref36">36</xref>], word embedding weighting and principal component removal [<xref ref-type="bibr" rid="ref37">37</xref>], and convolutional deep structured semantic model [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. However, recent studies on pretrained language models have achieved a breakthrough in sentence representation learning [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. Bidirectional encoder representations from transformers (BERT) builds upon the ideas from the transformer [<xref ref-type="bibr" rid="ref42">42</xref>] to construct rich sentence representations and has achieved state-of-the-art results on many general and clinical domain NLP tasks [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. In this process, a transformer-based model is first pretrained on large corpora to learn universal language representations and is then fine-tuned with a task-specific output layer for the target task. BERT has been adapted to biomedical (bidirectional encoder representations from transformers for biomedical text mining [BioBERT]) [<xref ref-type="bibr" rid="ref44">44</xref>] and clinical (bidirectional encoder representations from transformers on clinical text mining [ClinicalBERT]) domains [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p>
        <p>The performance of BERT and its domain-specific variants could be further improved through MTL. MTL [<xref ref-type="bibr" rid="ref47">47</xref>] refers to training a model simultaneously for multiple related tasks, and MTL benefits from a regularization effect by alleviating overfitting to a specific task, thus making the learned representations universal across tasks. Supplementary training on intermediate tasks refers to the second stage of pretraining of a model, with data-rich intermediate supervised tasks. Recent studies, such as multi-task deep neural networks (MT-DNN) [<xref ref-type="bibr" rid="ref21">21</xref>] and supplementary training on intermediate labeled-data tasks [<xref ref-type="bibr" rid="ref20">20</xref>], show that the use of MTL and intermediate pretraining generates more robust and universal learned representations, resulting in better domain adaptation with fewer in-domain labels.</p>
        <p>The winning systems in ClinicalSTS 2018 challenge [<xref ref-type="bibr" rid="ref48">48</xref>] and SemEval 2017 [<xref ref-type="bibr" rid="ref49">49</xref>] built upon a combination of approaches referenced earlier in this section. In general, they employed ensembled feature engineering methods (random forest, gradient boosting, and XGBoost) with features based on n-gram overlap, edit distance, longest common prefix/suffix/substring, word alignments [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>], summarization and machine translation evaluation metrics, and deep learning [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. In contrast to these systems, our study builds upon the modern neural approaches referenced earlier. Specifically, our system implements MTL and supplementary training on intermediate labeled tasks with ClinicalBERT to achieve state-of-the-art performance on the ClinicalSTS 2019 task. Following the demonstration of our system at the 2019 n2c2/OHNLP challenge presentation, additional systems leveraging MTL in ClinicalBERT [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref54">54</xref>] have been implemented with promising results.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <p>The 2019 ClinicalSTS data set was prepared by the n2c2/OHNLP challenge organizers from sentences collected from clinical notes in the Mayo Clinic’s clinical data warehouse. Candidate sentence pairs were then generated using an average value ≥0.45 of surface lexical similarity methods, namely, Ratcliff/Obershelp [<xref ref-type="bibr" rid="ref55">55</xref>], cosine similarity, and Levenshtein distance. This resulted in 2054 pairs, of which 1642 were released as the training set and the remaining 412 were held by the organizers for testing. Protected health information was removed using a mix of a frequency filtering approach [<xref ref-type="bibr" rid="ref56">56</xref>] and a manual review process. Each sentence pair was independently reviewed by 2 clinical experts and scored on a scale of 0 to 5 based on their semantic equivalence (0 for no semantic equivalence to 5 for complete semantic equivalence). Interannotator agreement was 0.6 based on weighted Cohen kappa. The averaged score between the 2 annotators was used as the gold standard. <xref ref-type="table" rid="table1">Table 1</xref> presents a few examples from the data set.</p>
        <p>We split the provided training data set of 1642 sentence pairs into 75.03% (1232/1642), 14.98% (246/1642), and 9.99% (164/1642) to form our train, validation, and internal test data sets, respectively.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Sample sentence pairs and annotations from the clinical semantic textual similarity data set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="300"/>
            <col width="90"/>
            <col width="150"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Ground truth<sup>a</sup></td>
                <td>Score</td>
                <td colspan="2">Observations</td>
              </tr>
              <tr valign="top">
                <td>Sentence 1</td>
                <td>Sentence 2</td>
                <td>
                  <break/>
                </td>
                <td>Domain dependence</td>
                <td>Comments</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>“The patient was taken to the <italic>PACU</italic><sup>b</sup> in a stable condition.”</td>
                <td>“The patient was taken to the <italic>post anesthesia care unit</italic> postoperatively for recovery.”</td>
                <td>5.0</td>
                <td>Domain specific</td>
                <td>Clinical abbreviations</td>
              </tr>
              <tr valign="top">
                <td>“<italic>Albuterol [PROVENTIL/VENTOLIN] 90 mcg/Act HFA</italic><sup>c</sup> <italic>Aerosol 1-2 puffs by inhalation every 4 hours as needed</italic>.”</td>
                <td>“<italic>Ipratropium-Albuterol [COMBIVENT] 18-103 mcg/Actuation Aerosol 2 puffs by inhalation two times a day as needed</italic>”</td>
                <td>3.5</td>
                <td>Domain specific</td>
                <td>Medication instruction parsing</td>
              </tr>
              <tr valign="top">
                <td>“Cardiovascular assessment findings include <italic>heart rate normal, atrial fibrillation with controlled ventricular response</italic>.”</td>
                <td>“Cardiovascular assessment findings include <italic>heart rate, first degree AV</italic><sup>d</sup> <italic>Block</italic>.”</td>
                <td>3.0</td>
                <td>Domain specific</td>
                <td>Medical concept similarity and medical concept mapping</td>
              </tr>
              <tr valign="top">
                <td>“He was <italic>prepped and draped in the standard</italic> fashion.”</td>
                <td>“The affected shoulder was <italic>prepared and draped with the usual</italic> sterile technique.”</td>
                <td>3.0</td>
                <td>Domain independent</td>
                <td>Alignment</td>
              </tr>
              <tr valign="top">
                <td>“Musculoskeletal: <italic>Positive</italic> for gait problem, joint swelling and extremity pain.”</td>
                <td>“Musculoskeletal: <italic>Negative</italic> for back pain, myalgias and extremity pain.”</td>
                <td>1.5</td>
                <td>Domain independent</td>
                <td>Assertion classification (polarity)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Italics indicate the phrases within each sentence which correspond to the observations.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>PACU: post anesthesia care unit.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>HFA: hydrofluoroalkane.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>AV: atrioventricular.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Analysis of this data set revealed 2 characteristics that we consider in our approach to this task. First, the lack of sufficient training data makes it difficult to train robust machine learning models using only the given training data. Second, clinical semantic similarity relies on both domain-specific (eg, clinical abbreviation expansion, medical concept detection, and medical concept normalization) and domain-independent (eg, assertion classification and alignment detection) aspects, as demonstrated by the sample sentence pairs in <xref ref-type="table" rid="table1">Table 1</xref>. For the first sentence pair, a domain-specific understanding of PACU as an abbreviation for post anesthesia care unit is necessary to infer the high semantic equivalence. For the fourth sample sentence pair, domain-independent understanding of the difference in polarity between Positive and Negative is necessary to infer the low semantic equivalence.</p>
        <p>To address the lack of sufficient training data and leverage the domain-specific and domain-independent aspects of clinical semantic similarity, we propose an approach that combines the following:</p>
        <list list-type="bullet">
          <list-item>
            <p>an iterative intermediate multi-task training step for effective transfer learning employing other related annotated data sets</p>
          </list-item>
          <list-item>
            <p>an ensemble module that combines language models pretrained on both domain-specific and domain-independent data sets and also incorporates other features.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Iterative Intermediate Training Using MTL</title>
        <p>We performed iterative multi-task training on a transformer-based language model using annotated data sets from related tasks to induce representative knowledge of the target task. With each iteration, annotated data sets from related tasks were added or removed. Following data set selection, the language model was then trained using MTL on the selected data sets, fine-tuned on the target task, and its results were evaluated and error analysis was performed to determine the data set selection for the next iteration. We refer to this entire process as iterative intermediate training using multi-task learning (IIT-MTL).</p>
        <p>IIT-MTL is analogous to traditional feature-based machine learning methodologies, where performance evaluation and error analysis lead to feature selection used to train the model. In IIT-MTL, data set selection is employed in place of feature selection. <xref rid="figure2" ref-type="fig">Figure 2</xref> presents IIT-MTL compared with the traditional machine learning approach.</p>
        <p>For the ClinicalSTS task, ClinicalBERT was used as our base model as it was pretrained on a clinical corpus and provides clinically specific contextual embeddings most suited to our task. Through IIT-MTL, a refined clinical domain-specific language model, IIT-MTL on ClinicalBERT (IIT-MTL-ClinicalBERT), is obtained that has been iteratively tuned for high performance on the ClinicalSTS task.</p>
        <p>In the following sections, we present each step of IIT-MTL as applied to the ClinicalSTS task: (1) the data set selection process, including details of each iteration and data sets used; (2) the MTL architecture with the task-specific layers considered during the iterative process; and (3) fine-tuning on the target task.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Comparison of traditional machine learning approach (left), where performance evaluation and error analysis lead to feature selection, and our proposed iterative training using multi-task learning approach (right), where performance evaluation and error analysis lead to data set selection.</p>
          </caption>
          <graphic xlink:href="medinform_v8i11e22508_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Set Selection</title>
        <p>For effective performance on the target ClinicalSTS task, we not only trained our model using MTL as an intermediate step but also iteratively selected the data sets employed during this process based on error analysis of the performance on the target task. The selection of complementary data sets is critical to this process as it significantly impacts the contextual representations in the final model.</p>
        <p>Several publicly available data sets were considered in these iterations, including Semantic Textual Similarity Benchmark (STS-B) [<xref ref-type="bibr" rid="ref18">18</xref>], Recognizing Question Entailment (RQE) [<xref ref-type="bibr" rid="ref57">57</xref>], natural language inference data set for the clinical domain (MedNLI) [<xref ref-type="bibr" rid="ref24">24</xref>], and Quora Question Pairs (QQP) [<xref ref-type="bibr" rid="ref58">58</xref>]. STS-B consists of 8.6 K sentence pairs drawn from news headlines, video and image captions, and natural language inference data, each annotated with a score of 0 to 5 to indicate the degree of semantic equivalence. RQE consists of 8.9 K pairs of clinical questions, each annotated with a binary value to indicate entailment (or lack of) between the 2 questions. MedNLI consists of 14 K sentences extracted from clinical notes in the Medical Information Mart for Intensive Care (MIMIC-III) database [<xref ref-type="bibr" rid="ref59">59</xref>], with each sentence pair annotated as either entailment, neutral, or contradiction. QQP consists of 400 K pairs of questions extracted from the Quora question-and-answer website, each annotated with a binary value to indicate the similarity (or lack of) between the 2 questions. We created 2 additional data sets for use in IIT-MTL for ClinicalSTS: a sentence topic-based data set (Topic) and a medication named entity recognition data set (MedNER). Topic was created on sentences within the ClinicalSTS data set, where each sentence was manually annotated with a label from a predefined list of topics (eg, MED, SIGNORSYMPTOM, EXPLAIN, and OTHER). MedNER was autogenerated using a medication extraction tool [<xref ref-type="bibr" rid="ref60">60</xref>] on 1000 randomly selected clinical notes in the MIMIC-III database to recognize medications and their related artifacts (eg, strength, form, frequency, route, dosage, and duration). 
A summary of all data sets used is presented in <xref ref-type="table" rid="table2">Table 2</xref>, with additional details provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>-<xref ref-type="bibr" rid="ref62">62</xref>].</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Data sets used in multi-task learning.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="110"/>
            <col width="150"/>
            <col width="130"/>
            <col width="130"/>
            <col width="480"/>
            <thead>
              <tr valign="top">
                <td>Data set</td>
                <td>Task</td>
                <td>Domain</td>
                <td>Size</td>
                <td>Example</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>STS-B<sup>a</sup></td>
                <td>Sentence pair similarity</td>
                <td>General</td>
                <td>8600</td>
                <td>Sentence 1: “A young child is riding a horse”; Sentence 2: “A child is riding a horse”; Similarity: 4.75</td>
              </tr>
              <tr valign="top">
                <td>RQE<sup>b</sup></td>
                <td>Sentence pair classification</td>
                <td>Biomedical</td>
                <td>8900</td>
                <td>Sentence 1: “Doctor X thinks he is probably just a normal 18 month old but would like to know if there are a certain number of respiratory infections that are considered normal for that age”; Sentence 2: “Probably a normal 18 month old but how many respiratory infections are normal”; Ground truth: entailment</td>
              </tr>
              <tr valign="top">
                <td>MedNLI<sup>c</sup></td>
                <td>Sentence pair classification</td>
                <td>Clinical</td>
                <td>14,000</td>
                <td>Sentence 1: “Labs were notable for Cr 1.7 (baseline 0.5 per old records) and lactate 2.4”; Sentence 2: “Patient has normal Cr”; Ground truth: contradiction</td>
              </tr>
              <tr valign="top">
                <td>QQP<sup>d</sup></td>
                <td>Sentence pair classification</td>
                <td>General</td>
                <td>400,000</td>
                <td>Sentence 1: “Why do rockets look white?”; Sentence 2: “Why are rockets and boosters painted white?”; Ground truth: 1</td>
              </tr>
              <tr valign="top">
                <td>Topic</td>
                <td>Sentence classification</td>
                <td>Clinical</td>
                <td>1,300,000</td>
                <td>Sentence: “Negative for difficulty urinating, pain with urination, and frequent urination”; Ground truth: SIGNORSYMPTOM</td>
              </tr>
              <tr valign="top">
                <td>MedNER<sup>e</sup></td>
                <td>Token-wise classification</td>
                <td>Clinical</td>
                <td>15,000</td>
                <td>Sentence: “he developed respiratory distress on the AM<sup>f</sup> of admission, cough day PTA<sup>g</sup>, CXR<sup>h</sup> with B/L<sup>i</sup> LL<sup>j</sup> PNA<sup>k</sup>, started ciprofloxacin and levofloxacin”; Ground truth: ciprofloxacin [DRUG] levofloxacin [DRUG]</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>STS-B: semantic textual similarity benchmark.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>RQE: Recognizing Question Entailment.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>MedNLI: natural language inference data set for the clinical domain.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>QQP: Quora Question Pairs.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>MedNER: medication named entity recognition.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>AM: morning.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>PTA: prior to admission.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>CXR: chest x-ray.</p>
            </fn>
            <fn id="table2fn9">
              <p><sup>i</sup>B/L: bilateral.</p>
            </fn>
            <fn id="table2fn10">
              <p><sup>j</sup>LL: left lower.</p>
            </fn>
            <fn id="table2fn11">
              <p><sup>k</sup>PNA: pneumonia.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>We established 2 baselines by fine-tuning 2 pretrained language models, BERT and ClinicalBERT, on the target ClinicalSTS task. Using the stronger baseline of ClinicalBERT, a total of 5 iterations were performed in IIT-MTL for the ClinicalSTS task. The selection of data sets for each iteration was decided based on our understanding of the ClinicalSTS task and error analysis of the results of the previous iteration. The data set selection for each iteration is detailed as follows. For each iteration, D indicates the set of data sets used for multi-task training, following which the model is further fine-tuned to the target ClinicalSTS task and evaluated before the next iteration.</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Iteration 1: D={STS-B}</italic>: STS-B was employed for multi-task training because it conforms to the same task (STS) in the general domain.</p>
          </list-item>
          <list-item>
            <p><italic>Iteration 2: D={STS-B, RQE, MedNLI}</italic>: Next, we added RQE and MedNLI, which are sentence pair classification tasks in the clinical domain, and, hence, are similar to our target task from a domain perspective.</p>
          </list-item>
          <list-item>
            <p><italic>Iteration 3: D={STS-B, RQE, MedNLI, Topic}</italic>: Analysis of the output from iteration 2 showed that sentence pairs on different topics within ClinicalSTS express similarity in different ways. Thus, we created and added the Topic data set.</p>
          </list-item>
          <list-item>
            <p><italic>Iteration 4: D={STS-B, RQE, MedNLI, Topic, MedNER}</italic>: Analysis of the output from iteration 3 showed that medication instruction sentences (eg, “Tylenol tablet 2 tablets by mouth as needed.”) were the worst performing sentence pairs. To induce medication-related knowledge, we created and added the MedNER data set to the mix.</p>
          </list-item>
          <list-item>
            <p><italic>Iteration 5</italic>: D=<italic>{STS-B, RQE, MedNLI, Topic, MedNER, QQP}</italic>: QQP was added in our final iteration as it is a sentence pair classification task, although in the general domain.</p>
          </list-item>
        </list>
        <p>The final set of data sets used in the model for the ClinicalSTS task (IIT-MTL-ClinicalBERT) was determined based on the performance analysis of each iteration.</p>
      </sec>
      <sec>
        <title>Intermediate MTL Architecture</title>
        <p>The architecture of our intermediate MTL setup is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref> and is based on the process specified in the study by Liu et al [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Intermediate multi-task learning and fine-tuning architecture. ClinicalSTS: clinical semantic textual similarity; STS-B: semantic textual similarity benchmark; RQE: recognizing question entailment; MedNLI: natural language inference data set for the clinical domain; QQP: Quora question pairs; MedNER: medication named entity recognition data set; ClinicalBERT: bidirectional encoder representations from transformers on clinical text mining.</p>
          </caption>
          <graphic xlink:href="medinform_v8i11e22508_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The lower shared layers are based on BERT-base architecture [<xref ref-type="bibr" rid="ref19">19</xref>], whereas the higher segregated layers represent task-specific outputs. The task-specific layers correspond to the data sets selected during the data set selection.</p>
        <p>The input can either be a single sentence (X) or a pair of sentences (X<sub>1</sub>, X<sub>2</sub>) delimited with the separating token ([SEP]). All input texts are tokenized using WordPieces [<xref ref-type="bibr" rid="ref63">63</xref>] and truncated to spans no longer than 512 tokens. Following this, tokens are added to the start ([CLS]) and end ([SEP]) of the input. In the shared layers, a lexicon encoder converts the input into a sequence of input embedding vectors, one for each token. Next, a transformer encoder captures the contextual information and generates a sequence of contextual embeddings. This semantic representation is shared across all tasks and feeds into multiple lightweight task-specific architectures, each implementing a different task objective. In the training phase, we fine-tuned the shared layers along with task-specific layers using the multi-task objectives, detailed below:</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Sentence Pair Similarity</italic>: Suppose h<sub>[CLS]</sub> is the contextual embedding of [CLS] for input sentence pair (X<sub>1</sub>, X<sub>2</sub>) and w<sub>SPS</sub> is a task-specific parameter vector. We utilized a fully connected layer to compute the similarity score 
            <inline-graphic xlink:href="medinform_v8i11e22508_fig5.png" mimetype="image" xlink:type="simple"/>

            
            , where <inline-graphic xlink:href="medinform_v8i11e22508_fig6.png" mimetype="image" xlink:type="simple"/>
            
            is a real value of range (−∞, ∞). We use the mean squared error as the objective function:</p>
          </list-item>
        </list>
        <disp-formula>
          <graphic xlink:href="medinform_v8i11e22508_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>     where y is the similarity score for the sentence pair.</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Single Sentence Classification</italic>: Suppose h<sub>[CLS]</sub> is the contextual embedding of [CLS] for input sentence X and w<sub>SSC</sub> is a task-specific parameter vector. The probability that X is labeled as class c is predicted by logistic regression with softmax:</p>
          </list-item>
        </list>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v8i11e22508_fig8.png" mimetype="image" xlink:type="simple"/>
        </disp-formula>
        <p>     This task is trained using the cross-entropy loss as the objective:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v8i11e22508_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>     where <inline-graphic xlink:href="medinform_v8i11e22508_fig13.png" mimetype="image" xlink:type="simple"/> is the binary indicator (0 or 1) if the class label c is the correct classification for X.</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Sentence Pair Classification</italic>: Suppose h<sub>[CLS]</sub> is the contextual embedding of [CLS] for sentence pair (X<sub>1</sub>, X<sub>2</sub>) and w<sub>SPC</sub> is a task-specific parameter vector. As the two sentences are packed together, we can predict that the relation R between X<sub>1</sub> and X<sub>2</sub> is given as 
          <inline-graphic xlink:href="medinform_v8i11e22508_fig10.png" mimetype="image" xlink:type="simple"/>
            
            similar to single sentence classification. We trained the task using the cross-entropy loss as specified previously.</p>
          </list-item>
        </list>
        <list list-type="bullet">
          <list-item>
            <p><italic>Token Classification</italic>: Suppose h<sub>[1:n]</sub> is the contextual embedding for tokens Tok<sub>[1:n]</sub> in packed sentence pair (X<sub>1</sub>, X<sub>2</sub>) and w<sub>TC</sub> is a task-specific parameter vector. The token classification is trained using a per-entity linear classifier, where the probability that Tok<sub>[j]</sub> is labeled as class c is predicted by logistic regression with softmax: <inline-graphic xlink:href="medinform_v8i11e22508_fig11.png" mimetype="image" xlink:type="simple"/>. Here, <inline-graphic xlink:href="medinform_v8i11e22508_fig12.png" mimetype="image" xlink:type="simple"/>. This task is trained using the cross-entropy loss as specified previously.</p>
          </list-item>
        </list>
        <p>The process for training our intermediate MTL architecture is demonstrated in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>. We initialized the shared layers of our architecture with the parameters of the pretrained ClinicalBERT [<xref ref-type="bibr" rid="ref46">46</xref>]. The task-specific layers were randomly initialized. We jointly refer to them as θ. Next, we created equal-sized subsamples (mini-batches) from each data set. For every epoch, a mini-batch b<sub>t</sub> was selected (from each of the MTL data sets detailed previously), and the model was updated according to the task-specific objective for task t. We used the mini-batch–based stochastic gradient descent to update the parameters. A detailed explanation of the training parameters is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref63">63</xref>-<xref ref-type="bibr" rid="ref65">65</xref>].</p>
        <boxed-text id="box1" position="float">
          <title>Multi-task learning algorithm.</title>
          <p>Initialize model parameters <italic>θ</italic></p>
          <p>Create E by merging mini-batches (b<sub>t</sub>) for each data set in D</p>
          <p>for epoch in 1,2,….., epoch<sub>max</sub> do</p>
          <p>     Shuffle E</p>
          <p>     for b<sub>t</sub> in E do</p>
          <p>          Compute loss: <italic>L (θ)</italic> based on task <italic>t</italic>;</p>
          <p>          Compute gradient: <italic>∇(θ)</italic></p>
          <p>          Update model: <italic>θ=θ−η∇(θ)</italic></p>
          <p>     end</p>
          <p>end</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Fine-Tuning</title>
        <p>After multi-task training, we fine-tuned the model on the target ClinicalSTS task. As ClinicalSTS is a sentence similarity task, we fine-tuned the sentence pair similarity task-specific layer of the multi-task architecture (<xref rid="figure3" ref-type="fig">Figure 3</xref>) to train the model using the ClinicalSTS data set. The predictions on the internal test data set were evaluated, which drove the data set selection process. A detailed explanation of the training parameters is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
      <sec>
        <title>Ensemble Module</title>
        <p>To induce both domain-specific and domain-independent aspects of clinical semantic similarity, we leveraged other pretrained language models in addition to IIT-MTL-ClinicalBERT in the ensemble module. During this process, we fine-tuned other pretrained language models on the target task, ensembled their predictions with predictions from IIT-MTL-ClinicalBERT (which was already fine-tuned during IIT-MTL), and then incorporated additional similarity features. In the following sections, we describe the (1) language models used, (2) additional similarity features incorporated, and (3) different ensembling techniques explored.</p>
      </sec>
      <sec>
        <title>Language Models</title>
        <p>A total of 4 language models were used in our ensemble module: IIT-MTL-ClinicalBERT, BioBERT [<xref ref-type="bibr" rid="ref44">44</xref>], MT-DNN [<xref ref-type="bibr" rid="ref21">21</xref>], and robustly optimized BERT approach (RoBERTa) [<xref ref-type="bibr" rid="ref66">66</xref>]. IIT-MTL-ClinicalBERT, the output of IIT-MTL, was derived from ClinicalBERT [<xref ref-type="bibr" rid="ref46">46</xref>], and therefore, it provided clinical domain-specific contextual embeddings. To provide contextual representations from a similar but slightly different domain, we used BioBERT, which is also BERT-based but has been further pretrained on the biomedical corpus. To account for the domain-independent aspects of clinical semantic similarity, we used language models from the general domain, specifically RoBERTa and MT-DNN. RoBERTa is based on BERT but has been optimized for better performance, whereas MT-DNN leverages large amounts of cross-task data, resulting in more generalized and robust text representations. We selected RoBERTa and MT-DNN for use in our ensemble module because at the time of the 2019 n2c2/OHNLP challenge, they achieved state-of-the-art results on multiple tasks similar to ClinicalSTS, including STS-B [<xref ref-type="bibr" rid="ref43">43</xref>], Multi-Genre Natural Language Inference [<xref ref-type="bibr" rid="ref23">23</xref>], Question answering Natural Language Inferencing [<xref ref-type="bibr" rid="ref67">67</xref>], and Recognizing Textual Entailment [<xref ref-type="bibr" rid="ref68">68</xref>]. <xref ref-type="table" rid="table3">Table 3</xref> presents an overview of the language models used in our experiments.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Pretrained language models used in the ensemble module and their training corpora.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="254"/>
            <col width="621"/>
            <col width="125"/>
            <thead>
              <tr valign="top">
                <td>Language model</td>
                <td>Corpora for language model pretraining</td>
                <td>Domain</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MT-DNN<sup>a</sup></td>
                <td>Wikipedia+BookCorpus</td>
                <td>General</td>
              </tr>
              <tr valign="top">
                <td>RoBERTa<sup>b</sup></td>
                <td>Wikipedia+BookCorpus+CC-News+OpenWebText+Stories</td>
                <td>General</td>
              </tr>
              <tr valign="top">
                <td>BioBERT<sup>c</sup></td>
                <td>Wikipedia+BookCorpus+PubMed+PMC<sup>d</sup></td>
                <td>Biomedical</td>
              </tr>
              <tr valign="top">
                <td>IIT-MTL-ClinicalBERT<sup>e</sup></td>
                <td>Wikipedia+BookCorpus+MIMIC-III<sup>f</sup></td>
                <td>Clinical</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>MT-DNN: multi-task deep neural networks.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>RoBERTa: robustly optimized bidirectional encoder representations from transformers approach.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>BioBERT: bidirectional encoder representations from transformers for biomedical text mining.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>PMC: PubMed Central.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>IIT-MTL-ClinicalBERT: iteratively trained using multi-task learning on ClinicalBERT.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>MIMIC-III: Medical Information Mart for Intensive Care.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Other Similarity Features</title>
        <p>Under the hypothesis that aggregating similarity metrics from different perspectives could help further boost performance, we incorporated additional string similarity features into our ensembled model. On the basis of the observation that medication instructions appear frequently in our data set, we incorporated medication features by (1) using a medication information extraction system [<xref ref-type="bibr" rid="ref69">69</xref>] to extract medications and their related attributes (eg, drug name, dosage, duration, form, frequency, route, and strength) from the text and (2) converting the extracted attributes into composite features. We also incorporated additional features shown to be useful in the previous 2018 ClinicalSTS challenge, including domain-specific features and phrasal similarity features. Details on these features are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref69">69</xref>-<xref ref-type="bibr" rid="ref71">71</xref>].</p>
      </sec>
      <sec>
        <title>Ensemble Methods</title>
        <p>A total of 3 learning algorithms for regression were used for ensembling language model outputs and features: linear regression, Bayesian regression, and ridge regression. Note that we also explored random forest and XGBoost, which were used in the previous year’s winning systems, but found that they underperformed, and therefore, we did not use those methods. On the basis of the performance on the internal test data set, we experimented with incrementally averaging different combinations of the constituent model outputs while adding the other similarity features previously described. A detailed explanation of the training parameters is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> presents an overview of our end-to-end system on the ClinicalSTS task, consisting of an iterative intermediate multi-task training step followed by an ensemble module. Note that the intermediate MTL and fine-tuning portion of <xref rid="figure4" ref-type="fig">Figure 4</xref> was presented earlier in more detail in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Overview of our end-to-end system. ClinicalBERT: bidirectional encoder representations from transformers on clinical text; IIT-MTL-ClinicalBERT: iterative intermediate training using multi-task learning on ClinicalBERT; MT-DNN: multi-task deep neural networks; RoBERTa: robustly optimized BERT approach; BioBERT: bidirectional encoder representations from transformers for biomedical text mining.</p>
          </caption>
          <graphic xlink:href="medinform_v8i11e22508_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>We evaluated the proposed system using the evaluation script released by the organizers of the 2019 n2c2/OHNLP challenge to measure the Pearson correlation coefficient (PCC) between the human-annotated (gold standard) and predicted clinical semantic similarity scores. In the Results section, we report the PCC on the internal test data set for each iteration in IIT-MTL as well as on each combination of language models tried during ensembling. We also report the PCC for our 3 official submissions to the 2019 n2c2/OHNLP challenge on both the internal test data set and withheld external test data set.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Iterative Intermediate Training Using MTL</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> presents the results of each iteration in IIT-MTL. In comparison with the ClinicalBERT baseline, the addition of complementary data sets improved the overall model performance. Notably, not all data set additions resulted in improved performance. This is highlighted in iteration 5, where the addition of QQP led to a significant drop in performance. As the model from iteration 4 showed the best performance on the internal test data set, we adopted this variant for the final IIT-MTL-ClinicalBERT model.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Results of each iteration of iterative intermediate training using multi-task learning.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="31"/>
            <col width="179"/>
            <col width="89"/>
            <col width="89"/>
            <col width="125"/>
            <col width="82"/>
            <col width="132"/>
            <col width="89"/>
            <col width="184"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Experiment and language model</td>
                <td colspan="6">Data sets used for iterative intermediate training approach using multi-task learning</td>
                <td>Pearson correlation coefficient on internal test</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>STS-B<sup>a</sup></td>
                <td>RQE<sup>b</sup></td>
                <td>MedNLI<sup>c</sup></td>
                <td>Topic</td>
                <td>MedNER<sup>d</sup></td>
                <td>QQP<sup>e</sup></td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>BL<sup>f</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1 BERT<sup>g</sup></td>
                <td>—<sup>h</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>0.834</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2 ClinicalBERT<sup>i</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>0.848</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Iter<sup>j</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1 ClinicalBERT</td>
                <td>✓<sup>k</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>0.852</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2 ClinicalBERT</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>0.862</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3 ClinicalBERT</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>—</td>
                <td>0.866</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>4 ClinicalBERT</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>
                  <italic>0.870</italic>
                  <sup>l</sup>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>5 ClinicalBERT</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>0.856</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>STS-B: semantic textual similarity benchmark.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>RQE: Recognizing Question Entailment.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>MedNLI: Natural Language Inference data set for the clinical domain.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>MedNER: Medication-NER data set.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>QQP: Quora Question Pair data set.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>BL: baseline.</p>
            </fn>
            <fn id="table4fn7">
              <p><sup>g</sup>BERT: bidirectional encoder representations from transformers.</p>
            </fn>
            <fn id="table4fn8">
              <p><sup>h</sup>Indicates data set was not used for this experiment.</p>
            </fn>
            <fn id="table4fn9">
              <p><sup>i</sup>ClinicalBERT: bidirectional encoder representations from transformers on clinical text mining.</p>
            </fn>
            <fn id="table4fn10">
              <p><sup>j</sup>Iter: iteration.</p>
            </fn>
            <fn id="table4fn11">
              <p><sup>k</sup>Indicates data sets that were trained together in multi-task learning.</p>
            </fn>
            <fn id="table4fn12">
              <p><sup>l</sup>Italics signify highest Pearson correlation coefficient obtained on internal test data set.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ensemble Module</title>
        <p><xref ref-type="table" rid="table5">Table 5</xref> presents the results of the language model ensemble experiments performed on the internal test data set. Here, the statistical mean of the normalized language model outputs was used as our ensemble method. Of the individual models, IIT-MTL-ClinicalBERT and BioBERT, which were pretrained on clinical and biomedical corpora, respectively, achieved higher PCC as compared with MT-DNN and RoBERTa, which were pretrained only on general domain corpora. In general, ensembled models performed better than the individual constituent models alone, with the combination of IIT-MTL-ClinicalBERT, BioBERT, and MT-DNN resulting in the highest performance (PCC 0.8809) on the internal test data set.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Ablation study of language models utilized in the ensemble module. The statistical mean of the language model outputs was used as the ensembling method.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="138"/>
            <col width="223"/>
            <col width="130"/>
            <col width="93"/>
            <col width="134"/>
            <col width="282"/>
            <thead>
              <tr valign="top">
                <td>Experiment</td>
                <td colspan="4">Language model ensemble</td>
                <td>Pearson correlation coefficient on internal test</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>IIT-MTL-ClinicalBERT<sup>a</sup></td>
                <td>BioBERT<sup>b</sup></td>
                <td>MT-DNN<sup>c</sup></td>
                <td>RoBERTa<sup>d</sup></td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>✓<sup>e</sup></td>
                <td>—<sup>f</sup></td>
                <td>—</td>
                <td>—</td>
                <td>0.8711</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>—</td>
                <td>✓</td>
                <td>—</td>
                <td>—</td>
                <td>0.8707</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>—</td>
                <td>—</td>
                <td>✓</td>
                <td>—</td>
                <td>0.8685</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>✓</td>
                <td>0.8578</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>—</td>
                <td>0.8754</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>—</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>0.8780</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>—</td>
                <td>—</td>
                <td>✓</td>
                <td>✓</td>
                <td>0.8722</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>✓</td>
                <td>—</td>
                <td>—</td>
                <td>✓</td>
                <td>0.8741</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>✓</td>
                <td>—</td>
                <td>✓</td>
                <td>—</td>
                <td>0.8796</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>—</td>
                <td>✓</td>
                <td>—</td>
                <td>✓</td>
                <td>0.8720</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>
                  <italic>0.8809</italic>
                  <sup>g</sup>
                </td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>—</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>0.8769</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>✓</td>
                <td>—</td>
                <td>✓</td>
                <td>✓</td>
                <td>0.8787</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>✓</td>
                <td>✓</td>
                <td>—</td>
                <td>✓</td>
                <td>0.8764</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>✓</td>
                <td>0.8795</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>IIT-MTL-ClinicalBERT: iterative intermediate training using multi-task learning on ClinicalBERT.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>BioBERT: bidirectional encoder representations from transformers for biomedical text mining.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>MT-DNN: multi-task deep neural networks.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>RoBERTa: robustly optimized bidirectional encoder representations from transformers approach.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>Indicates which language models are included in the ensemble.</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>Indicates language model was not used for this experiment.</p>
            </fn>
            <fn id="table5fn7">
              <p><sup>g</sup>Italics signify the highest Pearson correlation coefficient obtained on internal test data set.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>On the basis of the experiments presented in <xref ref-type="table" rid="table5">Table 5</xref>, IIT-MTL-ClinicalBERT &#38; BioBERT &#38; MT-DNN was adopted as the base combination of language models for our official submissions. <xref ref-type="table" rid="table6">Table 6</xref> presents the results of this base combination of language models, with incremental addition of other similarity features using four different ensemble methods. Results are shown for both the internal and withheld external test data sets. Note that the addition of domain-specific and phrasal similarity features has been included in <xref ref-type="table" rid="table6">Table 6</xref> for completeness (although it resulted in lower performance) because it was part of our official submissions.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>End-to-end ensemble module and official submission results.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="247"/>
            <col width="94"/>
            <col width="94"/>
            <col width="94"/>
            <col width="94"/>
            <col width="0"/>
            <col width="94"/>
            <col width="94"/>
            <col width="94"/>
            <col width="95"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Components</td>
                <td colspan="5">Pearson correlation coefficient on internal test<sup>a</sup></td>
                <td colspan="4">Pearson correlation coefficient on external test<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>Mean</td>
                <td>LR<sup>b</sup></td>
                <td>BR<sup>c</sup></td>
                <td>RR<sup>d</sup></td>
                <td colspan="2">Mean</td>
                <td>LR</td>
                <td>BR</td>
                <td>RR</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>IIT-MTL-ClinicalBERT<sup>e</sup> &#38; MT-DNN<sup>f</sup> &#38; BioBERT<sup>g</sup></td>
                <td>
                  <italic>0.8809</italic>
                </td>
                <td>0.8796</td>
                <td>0.8795</td>
                <td>0.8796</td>
                <td colspan="2">
                  <italic>0.9006</italic>
                </td>
                <td>0.8978</td>
                <td>0.8978</td>
                <td>0.8978</td>
              </tr>
              <tr valign="top">
                <td>+ medication features</td>
                <td>N/A<sup>h</sup></td>
                <td>
                  <italic>0.8841</italic>
                </td>
                <td>0.8832</td>
                <td>0.8831</td>
                <td colspan="2">N/A</td>
                <td>
                  <italic>0.9010</italic>
                </td>
                <td>0.8997</td>
                <td>0.8975</td>
              </tr>
              <tr valign="top">
                <td>+ domain-specific and phrasal similarity features</td>
                <td>N/A</td>
                <td>0.8733</td>
                <td>0.8741</td>
                <td>
                  <italic>0.8799</italic>
                </td>
                <td colspan="2">N/A</td>
                <td>0.8861</td>
                <td>0.8920</td>
                <td>
                  <italic>0.8875</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>Italics signify the Pearson correlation coefficient obtained on the internal and external test data set corresponding to the three configurations (components and ensemble method) that were our official submissions to the 2019 n2c2/OHNLP challenge.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>LR: linear regression.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>BR: Bayesian regression.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>RR: ridge regression.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>IIT-MTL-ClinicalBERT: iterative intermediate training using multi-task learning on ClinicalBERT.</p>
            </fn>
            <fn id="table6fn6">
              <p><sup>f</sup>MT-DNN: multi-task deep neural networks.</p>
            </fn>
            <fn id="table6fn7">
              <p><sup>g</sup>BioBERT: bidirectional encoder representations from transformers for biomedical text mining.</p>
            </fn>
            <fn id="table6fn8">
              <p><sup>h</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Official Submission</title>
        <p>The best performing configurations on the internal test data set, as shown in <xref ref-type="table" rid="table6">Table 6</xref>, were entered as our official submissions to the 2019 n2c2/OHNLP ClinicalSTS challenge. The details of each of our 3 official submissions are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>Submission 1: IIT-MTL-ClinicalBERT &#38; MT-DNN &#38; BioBERT</p>
            <list>
              <list-item>
                <p>A statistical mean of the scores produced by the language models, specifically IIT-MTL-ClinicalBERT, MT-DNN, and BioBERT.</p>
              </list-item>
            </list>
            <p/>
          </list-item>
          <list-item>
            <p>Submission 2: IIT-MTL-ClinicalBERT &#38; MT-DNN &#38; BioBERT+medication features</p>
            <list>
              <list-item>
                <p>A linear regression model trained on each component output from Submission 1 and medication features.</p>
              </list-item>
            </list>
            <p/>
          </list-item>
          <list-item>
            <p>Submission 3: IIT-MTL-ClinicalBERT &#38; MT-DNN &#38; BioBERT+medication features+domain-specific and phrasal similarity features</p>
            <list>
              <list-item>
                <p>A ridge regression model trained on all features from Submission 2 and phrasal similarity and domain-specific features.</p>
              </list-item>
            </list>
            <p/>
          </list-item>
        </list>
        <p>Our submission 2 achieved first place out of 87 submitted systems with a PCC of 0.9010 based on the official results. Our submission 1 achieved second place with a PCC of 0.9006.</p>
        <p>With the release of the external test data set, we reran the experiments for language model ensembling on the external test data set. We identified the highest performing configuration on the external test data set as the statistical mean of the scores produced by the combination of IIT-MTL-ClinicalBERT, MT-DNN, and RoBERTa, which resulted in a PCC of 0.9025.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Iterative intermediate training using MTL is an effective way to leverage annotated data from related tasks to improve performance on the target task. However, it is critical to select data sets that can induce contextualized embeddings necessary for the target task. If the network is tasked with making predictions on unrelated tasks, negative transfer may ensue, resulting in lower quality predictions on the target task. Applying IIT-MTL to train ClinicalBERT with related tasks—STS-B, RQE, MedNLI, Topic, and MedNER—resulted in improved performance on the target ClinicalSTS task. However, the addition of QQP to the MTL step resulted in a significant drop in performance. This may be attributed to the fact that, in contrast to the other data sets used, QQP was created for a different sentence pair task (classification rather than regression) on the general domain (as opposed to RQE and MedNLI, which are on the clinical domain). This illustrates the importance of data set selection for the effectiveness of the intermediate multi-task training step.</p>
        <p>Ensembling language models pretrained on domain-specific and domain-independent corpora incorporates different aspects of clinical semantic similarity. <xref ref-type="table" rid="table7">Table 7</xref> presents the ground truth for two sentence pairs, along with predictions from each constituent model. The first sentence pair contains minimal domain-specific terminology; hence, the models trained on domain-independent corpora, MT-DNN and RoBERTa, predicted scores closer to the ground truth. The low ground truth score in the second sentence pair is because of dissimilar clinical concepts within the text; hence, the models trained on domain-specific corpora, IIT-MTL-ClinicalBERT and BioBERT, predicted scores closer to the ground truth.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Sample sentence pairs with ground truth annotations and predictions from three language models used in the final ensembled system.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="211"/>
            <col width="223"/>
            <col width="102"/>
            <col width="152"/>
            <col width="117"/>
            <col width="74"/>
            <col width="121"/>
            <thead>
              <tr valign="top">
                <td>Sentence 1</td>
                <td>Sentence 2</td>
                <td>Ground truth</td>
                <td colspan="4">Predictions</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>IIT-MTL-ClinicalBERT<sup>a</sup></td>
                <td>BioBERT<sup>b</sup></td>
                <td>MT-DNN<sup>c</sup></td>
                <td>RoBERTa<sup>d</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>“The following consent was read to the patient and accepted to order testing.”</td>
                <td>“We explained the risks, benefits, and alternatives, and the patient agreed to proceed.”</td>
                <td>2.5</td>
                <td>0.61</td>
                <td>1.01</td>
                <td>2.15</td>
                <td>2.51</td>
              </tr>
              <tr valign="top">
                <td>“Negative for coughing up blood, coughing up mucus (phlegm) and wheezing.”</td>
                <td>“Negative for abdominal pain, blood in stool, constipation, diarrhea and vomiting.”</td>
                <td>0.5</td>
                <td>1.04</td>
                <td>1.18</td>
                <td>2.34</td>
                <td>1.74</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>IIT-MTL-ClinicalBERT: iterative intermediate training using multi-task learning on ClinicalBERT.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup>BioBERT: bidirectional encoder representations from transformers for biomedical text mining.</p>
            </fn>
            <fn id="table7fn3">
              <p><sup>c</sup>MT-DNN: multi-task deep neural networks.</p>
            </fn>
            <fn id="table7fn4">
              <p><sup>d</sup>RoBERTa: robustly optimized bidirectional encoder representations from transformers approach.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Analysis of Model Performance</title>
        <p>Our best official submission achieved a PCC of 0.9010 on the external test data set. However, the model performance varies significantly depending on the gold similarity scores. On the low and high ends of the gold scores, [0-2) or [4-5], our model achieves a PCC of 0.9234. However, in the middle range of the gold scores, [2-4), it performs much worse with a PCC of 0.5631. The lower performance in the middle range can be partially attributed to ground truth issues. Weak-to-moderate interannotator agreement (0.6 weighted Cohen kappa) coupled with the lack of an adjudication process (scores from 2 annotators were averaged to provide the gold score) led to a concentration of annotation errors in the middle range of the gold scores. For example, greater disagreement between 2 annotators (eg, gold scores 1 and 5) will end up in the middle range (final averaged score 3) as compared with low disagreements (eg, 4 and 5 with the final score of 4.5). The drop in performance in the middle range may also indicate that although our model performs well at distinguishing completely similar or dissimilar sentence pairs, it struggles in scoring sentences with moderate clinical semantic similarity.</p>
        <p>To further investigate this behavior, we studied how predictions varied for each similarity interval using the withheld external test data set. For this, we converted the continuous range gold scores and our model predictions into 5 intervals: [0-1), [1-2), [2-3), [3-4), [4-5]. Using these intervals, we then calculated the F1-score by computing true positives, false positives, and false negatives. A prediction is a true positive if the gold score is in the same similarity interval as the prediction; otherwise, it is termed as false positive (in the predicted interval) and false negative (in the gold interval). Our best model achieves a relatively high F1-score at the extreme ranges (0.77, 0.80, and 0.71 for [0-1), [1-2), [4-5], respectively) but struggles in the middle intervals (0.23 and 0.44 for [2-3) and [3-4), respectively).</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>We acknowledge certain limitations of this study. First, these results are specific to the 2019 n2c2/OHNLP ClinicalSTS data set, which contains clinical text snippets from a single EHR data warehouse (Mayo Clinic EHR data warehouse). Furthermore, the chosen sentence pairs have high surface lexical similarity (ie, candidate pairs must have ≥0.45 average score of Ratcliff/Obershelp pattern matching algorithm, cosine similarity, and Levenshtein distance), which limits the variation in the data set. Thus, there is a need to validate this process on a more diverse ground truth, which (1) contains clinical text from multiple data warehouses and (2) allows for a less restrictive sentence pairing. Second, we observed inconsistencies in the ground truth, which may be inherent to a complex task such as clinical semantic textual similarity. We have made preliminary progress in quantifying these errors and their impact on the results, but more work is needed in this direction. Finally, although our system has achieved high PCC on the ClinicalSTS task, additional research is still needed to understand how to apply this foundational task to the real-world problem of bloated, disorganized clinical documentation.</p>
        <p>Although our system achieved state-of-the-art results in the challenge, the proposed system has the following avenues for improvement and further exploration:</p>
        <list list-type="order">
          <list-item>
            <p>The data set selection process in IIT-MTL is largely manual, driven by empirical observations and domain knowledge. Recent developments in automatic machine learning (AutoML), ranging from optimizing hyper-parameters using random search [<xref ref-type="bibr" rid="ref72">72</xref>] to discovering novel neural architectures using reinforcement learning [<xref ref-type="bibr" rid="ref73">73</xref>], have shown promising results. We plan to explore AutoML to relieve this manual effort in the future.</p>
          </list-item>
          <list-item>
            <p>The language model ensemble works well for inducing domain-specific and domain-independent knowledge. However, this process remains largely intuitive. We plan to explore how language modeling objectives influence the domain adaptability of the learned language models on the target task.</p>
          </list-item>
          <list-item>
            <p>At the time of the challenge, we applied our IIT-MTL methodology only to ClinicalBERT because of time constraints. We plan to employ our IIT-MTL methodology on other implemented language models and evaluate their performance.</p>
          </list-item>
          <list-item>
            <p>Our proposed system has a significant computational cost, as we leverage several transformer-based language models. We plan to explore the performance impact of replacing these models with their less computationally expensive counterparts [<xref ref-type="bibr" rid="ref74">74</xref>].</p>
          </list-item>
          <list-item>
            <p>In our experiments, inclusion of domain-specific and phrasal features led to a drop in performance. This is likely because of effective learning of these features by pretrained transformer-based language models, as observed in the general domain [<xref ref-type="bibr" rid="ref75">75</xref>,<xref ref-type="bibr" rid="ref76">76</xref>]. We wish to investigate this behavior further by utilizing probing tasks [<xref ref-type="bibr" rid="ref77">77</xref>] in transformer language models.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we presented an effective methodology leveraging (1) an iterative intermediate training step in an MTL setup and (2) multiple language models pretrained on diverse corpora, which achieved first place in the 2019 ClinicalSTS challenge. This study demonstrates the potential for IIT-MTL to improve the performance of other tasks restricted by limited data sets. This contribution opens new avenues of exploration for optimized data set selection to generate more robust and universal contextual representations of text in the clinical domain.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Data sets used in iterative intermediate training approach using multi-task learning methodology.</p>
        <media xlink:href="medinform_v8i11e22508_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 159 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Experimental settings.</p>
        <media xlink:href="medinform_v8i11e22508_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 159 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Implementation details of other similarity features.</p>
        <media xlink:href="medinform_v8i11e22508_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 86 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BioBERT</term>
          <def>
            <p>bidirectional encoder representations from transformers for biomedical text mining</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ClinicalBERT</term>
          <def>
            <p>bidirectional encoder representations from transformers on clinical text mining</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ClinicalSTS</term>
          <def>
            <p>clinical semantic textual similarity</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">IIT-MTL-ClinicalBERT</term>
          <def>
            <p>iterative intermediate training using multi-task learning on ClinicalBERT</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IIT-MTL</term>
          <def>
            <p>iterative intermediate training approach using multi-task learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MedNER</term>
          <def>
            <p>medication named entity recognition data set</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MedNLI</term>
          <def>
            <p>natural language inference data set for the clinical domain</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MIMIC-III</term>
          <def>
            <p>Medical Information Mart for Intensive Care III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">MT-DNN</term>
          <def>
            <p>multi-task deep neural networks</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">MTL</term>
          <def>
            <p>multi-task learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">n2c2</term>
          <def>
            <p>National Natural Language Processing Clinical Challenges</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">OHNLP</term>
          <def>
            <p>Open Health Natural Language Processing Consortium</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">PCC</term>
          <def>
            <p>Pearson correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">PMC</term>
          <def>
            <p>PubMed Central</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">QQP</term>
          <def>
            <p>Quora question pairs</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">RoBERTa</term>
          <def>
            <p>robustly optimized bidirectional encoder representations from transformers approach</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">RQE</term>
          <def>
            <p>recognizing question entailment</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">STS-B</term>
          <def>
            <p>semantic textual similarity benchmark</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">STS</term>
          <def>
            <p>semantic textual similarity</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors wish to thank Dr Bharath Dandala and Venkata Joopudi for providing valuable feedback on the manuscript.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jamoom</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Furukawa</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>EHR adopters vs non-adopters: impacts of, barriers to, and federal initiatives for EHR adoption</article-title>
          <source>Healthc (Amst)</source>
          <year>2014</year>
          <month>03</month>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>33</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26250087"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.hjdsi.2013.12.004</pub-id>
          <pub-id pub-id-type="medline">26250087</pub-id>
          <pub-id pub-id-type="pii">S2213-0764(13)00084-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Melton</surname>
              <given-names>GB</given-names>
            </name>
          </person-group>
          <article-title>Evaluating measures of redundancy in clinical texts</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2011</year>
          <volume>2011</volume>
          <fpage>1612</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22195227"/>
          </comment>
          <pub-id pub-id-type="medline">22195227</pub-id>
          <pub-id pub-id-type="pmcid">PMC3243221</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shoolin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ozeran</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hamann</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bria</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Association of medical directors of information systems consensus on inpatient electronic health record documentation</article-title>
          <source>Appl Clin Inform</source>
          <year>2013</year>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>293</fpage>
          <lpage>303</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23874365"/>
          </comment>
          <pub-id pub-id-type="doi">10.4338/ACI-2013-02-R-0012</pub-id>
          <pub-id pub-id-type="medline">23874365</pub-id>
          <pub-id pub-id-type="pmcid">PMC3716423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vogel</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Cut-and-paste clinical notes confuse care, say US internists</article-title>
          <source>Can Med Assoc J</source>
          <year>2013</year>
          <month>12</month>
          <day>10</day>
          <volume>185</volume>
          <issue>18</issue>
          <fpage>E826</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cmaj.ca/cgi/pmidlookup?view=long&#38;pmid=24218539"/>
          </comment>
          <pub-id pub-id-type="doi">10.1503/cmaj.109-4656</pub-id>
          <pub-id pub-id-type="medline">24218539</pub-id>
          <pub-id pub-id-type="pii">cmaj.109-4656</pub-id>
          <pub-id pub-id-type="pmcid">PMC3855142</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dimick</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Documentation bad habits. Shortcuts in electronic records pose risk</article-title>
          <source>J AHIMA</source>
          <year>2008</year>
          <month>06</month>
          <volume>79</volume>
          <issue>6</issue>
          <fpage>40</fpage>
          <lpage>3</lpage>
          <pub-id pub-id-type="medline">18604974</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Khanna</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Najafi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Characterizing the source of text in electronic health record progress notes</article-title>
          <source>JAMA Intern Med</source>
          <year>2017</year>
          <month>08</month>
          <day>1</day>
          <volume>177</volume>
          <issue>8</issue>
          <fpage>1212</fpage>
          <lpage>3</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28558106"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2017.1548</pub-id>
          <pub-id pub-id-type="medline">28558106</pub-id>
          <pub-id pub-id-type="pii">2629493</pub-id>
          <pub-id pub-id-type="pmcid">PMC5818790</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kroth</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Morioka-Douglas</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Veres</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Babbott</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Poplau</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qeadan</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Parshall</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrigan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Linzer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Association of electronic health record design and use factors with clinician stress and burnout</article-title>
          <source>JAMA Netw Open</source>
          <year>2019</year>
          <month>08</month>
          <day>2</day>
          <volume>2</volume>
          <issue>8</issue>
          <fpage>e199609</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2019.9609"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.9609</pub-id>
          <pub-id pub-id-type="medline">31418810</pub-id>
          <pub-id pub-id-type="pii">2748054</pub-id>
          <pub-id pub-id-type="pmcid">PMC6704736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Botsis</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hartvigsen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Secondary use of EHR: data quality issues and informatics opportunities</article-title>
          <source>Summit Transl Bioinform</source>
          <year>2010</year>
          <month>03</month>
          <day>1</day>
          <volume>2010</volume>
          <fpage>1</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21347133"/>
          </comment>
          <pub-id pub-id-type="medline">21347133</pub-id>
          <pub-id pub-id-type="pmcid">PMC3041534</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>MedSTS: a resource for clinical semantic textual similarity</article-title>
          <source>Lang Resour Eval</source>
          <year>2018</year>
          <month>10</month>
          <day>24</day>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>57</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-018-9431-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yanshan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sunyang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Feichen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sam</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ozlem</surname>
              <given-names>UH</given-names>
            </name>
          </person-group>
          <article-title>Overview of the 2019 N2C2/OHNLP track on clinical semantic textual similarity</article-title>
          <source>JMIR Med Inform</source>
          <comment>Preprint posted online August 10, 2020. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/preprint/23375"/></comment>
          <pub-id pub-id-type="doi">10.2196/preprints.23375</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Overview of BioCreative/OHNLP challenge 2018 task 2: clinical semantic textual similarity</article-title>
          <source>Clin Semantic Text Sim</source>
          <comment>Preprint posted online August 2018. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/327424883_Overview_of_BioCreativeOHNLP_Challenge_2018_Task_2_Clinical_Semantic_Textual_Similarity"/></comment>
          <pub-id pub-id-type="doi">10.13140/RG.2.2.26682.24006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>BioCreative/OHNLP Challenge 2018</article-title>
          <source>Proceedings of the 2018 ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</source>
          <year>2018</year>
          <month>08</month>
          <conf-name>ACM-BCB'18</conf-name>
          <conf-date>August 29-September 1, 2018</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3233547.3233672</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Agirre</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Task 6: A Pilot on Semantic Textual Similarity</article-title>
          <source>First Joint Conference on Lexical and Computational Semantics – Volume 1: Proceedings of the Main Conference and the Shared Task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation (Semeval 2012)</source>
          <year>2012</year>
          <conf-name>SEM'12</conf-name>
          <conf-date>June 7-8, 2012</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
          <fpage>385</fpage>
          <lpage>93</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s15-2045</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Agirre</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Shared task: Semantic Textual Similarity</article-title>
          <source>Second Joint Conference on Lexical and Computational Semantics</source>
          <year>2013</year>
          <conf-name>SEM'13</conf-name>
          <conf-date>June 13-14, 2013</conf-date>
          <conf-loc>Atlanta, Georgia, USA</conf-loc>
          <fpage>32</fpage>
          <lpage>43</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Banea</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cardie</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Agirre</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mihalcea</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rigau</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wiebe</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Task 10: Multilingual Semantic Textual Similarity</article-title>
          <source>Proceedings of the 8th International Workshop on Semantic Evaluation</source>
          <year>2014</year>
          <conf-name>SemEval'14</conf-name>
          <conf-date>August 23-24, 2014</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>81</fpage>
          <lpage>91</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/s14-2010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Banea</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cardie</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Agirre</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Gazpio</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Maritxalar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mihalcea</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rigau</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Uria</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wiebe</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Task 2: Semantic Textual Similarity, English, Spanish and Pilot on Interpretability</article-title>
          <source>Proceedings of the 9th International Workshop on Semantic Evaluation</source>
          <year>2015</year>
          <conf-name>SemEval'15</conf-name>
          <conf-date>June 4-5, 2015</conf-date>
          <conf-loc>Denver, Colorado</conf-loc>
          <fpage>252</fpage>
          <lpage>63</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s15-2045</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Banea</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Agirre</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mihalcea</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rigau</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wiebe</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Task 1: Semantic Textual Similarity, Monolingual and Cross-Lingual Evaluation</article-title>
          <source>Proceedings of the 10th International Workshop on Semantic Evaluation</source>
          <year>2016</year>
          <conf-name>SemEval'16</conf-name>
          <conf-date>June 16-17, 2016</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <fpage>497</fpage>
          <lpage>511</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s16-1081</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Gazpio</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Specia</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation</source>
          <year>2017</year>
          <conf-name>SemEval'17</conf-name>
          <conf-date>August 3-4, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source>
          <year>2019</year>
          <conf-name>NAACL HLT'19</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Phang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Févry</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Sentence encoders on STILTs: supplementary training on intermediate labeled-data tasks</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>epub ahead of print. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1811.01088"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Multi-Task Deep Neural Networks for Natural Language Understanding</article-title>
          <source>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2019</year>
          <conf-name>ACL'19</conf-name>
          <conf-date>July 28-August 2, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1441</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dolan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Brockett</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Automatically Constructing a Corpus of Sentential Paraphrases</article-title>
          <source>Third International Workshop on Paraphrasing</source>
          <year>2005</year>
          <conf-name>IWP'05</conf-name>
          <conf-date>October 11-13, 2005</conf-date>
          <conf-loc>Jeju Island, Korea</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/I05-5002.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nangia</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source>
          <year>2018</year>
          <conf-name>NAACL-HLT'18</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Romanov</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shivade</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Lessons from Natural Language Inference in the Clinical Domain</article-title>
          <source>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2018</year>
          <conf-name>EMNLP'18</conf-name>
          <conf-date>October 31-November 4, 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Angeli</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Potts</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A Large Annotated Corpus for Learning Natural Language Inference</article-title>
          <source>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2015</year>
          <conf-name>EMNLP'15</conf-name>
          <conf-date>September 17-21, 2015</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d15-1075</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Šarić</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Glavaš</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Karan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Šnajder</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bašić</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>TakeLab: Systems for Measuring Semantic Text Similarity</article-title>
          <source>The First Joint Conference on Lexical and Computational Semantics – Volume 1: Proceedings of the main conference and the shared task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation</source>
          <year>2012</year>
          <conf-name>SEM'12</conf-name>
          <conf-date>June 7-8, 2012</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jimenez</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Becerra</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gelbukh</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Soft Cardinality: A Parameterized Similarity Function for Text Comparison</article-title>
          <source>The First Joint Conference on Lexical and Computational Semantics – Volume 1: Proceedings of the main conference and the shared task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation</source>
          <year>2012</year>
          <conf-name>SEM'12</conf-name>
          <conf-date>June 7-8, 2012</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bär</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Biemann</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gurevych</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zesch</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>UKP: Computing Semantic Textual Similarity by Combining Multiple Content Similarity Measures</article-title>
          <source>The First Joint Conference on Lexical and Computational Semantics – Volume 1: Proceedings of the main conference and the shared task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation</source>
          <year>2012</year>
          <conf-name>SEM'12</conf-name>
          <conf-date>June 7-8, 2012</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed Representations of Words and Phrases and Their Compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>
          <year>2013</year>
          <conf-name>NIPS'13</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, Nevada</conf-loc>
          <fpage>3111</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.5555/2999792.2999959</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanson</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Musicassette interchangeability: the facts behind the facts</article-title>
          <source>AES J Audio Eng Soc</source>
          <year>1971</year>
          <volume>19</volume>
          <issue>5</issue>
          <fpage>-</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.aes.org/e-lib/browse.cfm?elib=1333"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wieting</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bansal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gimpel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Livescu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>From paraphrase database to compositional paraphrase model and back</article-title>
          <source>Trans Assoc Comput Linguist</source>
          <year>2015</year>
          <month>12</month>
          <volume>3</volume>
          <fpage>345</fpage>
          <lpage>58</lpage>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00143</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Bag of Tricks for Efficient Text Classification</article-title>
          <source>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers</source>
          <year>2017</year>
          <conf-name>EACL'17</conf-name>
          <conf-date>April 3-7, 2017</conf-date>
          <conf-loc>Valencia, Spain</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/e17-2068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Distributed Representations of Sentences and Documents</article-title>
          <source>Proceedings of the 31st International Conference on Machine Learning</source>
          <year>2014</year>
          <conf-name>ICML'14</conf-name>
          <conf-date>June 21–26, 2014</conf-date>
          <conf-loc>Beijing, China</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>An Empirical Evaluation of DOC2VEC with Practical Insights into Document Embedding Generation</article-title>
          <source>Proceedings of the 1st Workshop on Representation Learning for NLP</source>
          <year>2016</year>
          <month>08</month>
          <conf-name>REPL4NLP'16</conf-name>
          <conf-date>August 11, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>78</fpage>
          <lpage>86</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w16-1609</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pagliardini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jaggi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised Learning of Sentence Embeddings Using Compositional n-Gram Features</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source>
          <year>2018</year>
          <conf-name>NAACL-HLT'18</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conneau</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kiela</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schwenk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Barrault</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Supervised Learning of Universal Sentence Representations from Natural Language Inference Data</article-title>
          <source>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2017</year>
          <conf-name>EMNLP'17</conf-name>
          <conf-date>September 7–11, 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d17-1070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A Simple but Tough-to-beat Baseline for Sentence Embeddings</article-title>
          <source>5th International Conference on Learning Representations</source>
          <year>2017</year>
          <conf-name>ICLR'17</conf-name>
          <conf-date>April 24-26, 2017</conf-date>
          <conf-loc>Toulon, France</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Task 1: Use convolutional neural network to evaluate Semantic Textual Similarity</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation</source>
          <year>2017</year>
          <conf-name>SemEval'17</conf-name>
          <conf-date>August 3-4, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2016</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Acero</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Heck</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Learning Deep Structured Semantic Models for Web Search Using Clickthrough Data</article-title>
          <source>Proceedings of the 22nd ACM International Conference on Information &#38; Knowledge Management</source>
          <year>2013</year>
          <conf-name>CIKM'13</conf-name>
          <conf-date>October 27-November 1, 2013</conf-date>
          <conf-loc>San Francisco, California</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2505515.2505665</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ruder</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Universal Language Model Fine-Tuning for Text Classification</article-title>
          <source>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>
          <year>2018</year>
          <conf-name>ACL'18</conf-name>
          <conf-date>July 15-20, 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p18-1031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salimans</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Improving Language Understanding by Generative Pre-Training</article-title>
          <source>Semantic Scholar</source>
          <year>2018</year>
          <access-date>2020-11-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf">https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is All You Need</article-title>
          <source>Proceedings of the 31st International Conference on Neural Information Processing Systems</source>
          <year>2017</year>
          <conf-name>NIPS'17</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>2017</fpage>
          <pub-id pub-id-type="doi">10.5555/3295222.3295349</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michael</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hill</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding</article-title>
          <source>International Conference on Learning Representations</source>
          <year>2019</year>
          <conf-name>ICLR'19</conf-name>
          <conf-date>May 6-9, 2019</conf-date>
          <conf-loc>New Orleans</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>40</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Altosaar</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ranganath</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.05342"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly Available Clinical BERT Embeddings</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>ClinicalNLP'19</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>A survey on multi-task learning</article-title>
          <source>arXiv</source>
          <year>2017</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1707.08114"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Combining Rich Features and Deep Learning for Finding Similar Sentences in Electronic Medical Records</article-title>
          <source>Proceedings of the BioCreative/OHNLP Challenge</source>
          <year>2018</year>
          <conf-name>OHNLP'18</conf-name>
          <conf-date>September 1-8, 2018</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/327402060_Combining_rich_features_and_deep_learning_for_finding_similar_sentences_in_electronic_medical_records"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>ECNU at SemEval-2017 Task 1: Leverage Kernel-based Traditional NLP features and Neural Networks to Build a Universal Model for Multilingual and Cross-lingual Semantic Textual Similarity</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation</source>
          <year>2017</year>
          <conf-name>SemEval'17</conf-name>
          <conf-date>August 3-4, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sultan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sumner</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>DLS@CU: Sentence Similarity from Word Alignment</article-title>
          <source>Proceedings of the 8th International Workshop on Semantic Evaluation</source>
          <year>2014</year>
          <conf-name>SemEval'14</conf-name>
          <conf-date>August 23-24, 2014</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/s14-2039</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sultan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sumner</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>DLS@CU: Sentence Similarity from Word Alignment and Semantic Vector Composition</article-title>
          <source>Proceedings of the 9th International Workshop on Semantic Evaluation</source>
          <year>2015</year>
          <conf-name>SemEval'15</conf-name>
          <conf-date>June 4-5, 2015</conf-date>
          <conf-loc>Denver, Colorado</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/s15-2027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Limtiaco</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>John</surname>
              <given-names>RS</given-names>
            </name>
          </person-group>
          <article-title>Universal sentence encoder</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1803.11175"/></comment>
          <pub-id pub-id-type="doi">10.18653/v1/d18-2029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mulyar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>MT-clinical BERT: scaling clinical information extraction with multitask learning</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <fpage>-</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2004.10220"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>An Empirical Study of Multi-Task Learning on BERT for Biomedical Text Mining</article-title>
          <source>Proceedings of the 19th SIGBioMed Workshop on Biomedical Language Processing</source>
          <year>2020</year>
          <conf-name>BioNLP'20</conf-name>
          <conf-date>July 9, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2020.bionlp-1.22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="web">
          <article-title>Ratcliff/Obershelp Pattern Recognition</article-title>
          <source>NIST: National Institute of Standards and Technology</source>
          <year>2004</year>
          <access-date>2020-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html">https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Elayavilli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrabi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A Frequency-filtering Strategy of Obtaining PHI-free Sentences From Clinical Data Repository</article-title>
          <source>Proceedings of the 6th ACM Conference on Bioinformatics, Computational Biology and Health Informatics</source>
          <year>2015</year>
          <conf-name>BCB'15</conf-name>
          <conf-date>September 9-12, 2015</conf-date>
          <conf-loc>Atlanta, GA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2808719.2808752</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abacha</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Recognizing Question Entailment for Medical Question Answering</article-title>
          <source>Proceedings of the Annual Symposium</source>
          <year>2016</year>
          <conf-name>AMIA'16</conf-name>
          <conf-date>June 12-18, 2016</conf-date>
          <conf-loc>Chicago, Illinois</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Graesser</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nangia</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Evci</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Natural language understanding with the quora question pairs dataset</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1907.01041.pdf"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Stenner</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Doan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>KB</given-names>
            </name>
            <name name-style="western">
              <surname>Waitman</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>MedEx: a medication information extraction system for clinical narratives</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>19</fpage>
          <lpage>24</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20064797"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M3378</pub-id>
          <pub-id pub-id-type="medline">20064797</pub-id>
          <pub-id pub-id-type="pii">17/1/19</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995636</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ely</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Osheroff</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Gorman</surname>
              <given-names>PN</given-names>
            </name>
            <name name-style="western">
              <surname>Ebell</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Chambliss</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Pifer</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Stavri</surname>
              <given-names>PZ</given-names>
            </name>
          </person-group>
          <article-title>A taxonomy of generic clinical questions: classification study</article-title>
          <source>Br Med J</source>
          <year>2000</year>
          <month>08</month>
          <day>12</day>
          <volume>321</volume>
          <issue>7258</issue>
          <fpage>429</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/10938054"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.321.7258.429</pub-id>
          <pub-id pub-id-type="medline">10938054</pub-id>
          <pub-id pub-id-type="pmcid">PMC27459</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
          <article-title>Quora Question Pairs</article-title>
          <source>Kaggle</source>
          <access-date>2020-11-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.kaggle.com/c/quora-question-pairs">https://www.kaggle.com/c/quora-question-pairs</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
          </person-group>
          <article-title>Google's neural machine translation system: bridging the gap between human and machine translation</article-title>
          <source>arXiv</source>
          <year>2016</year>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1609.08144.pdf"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="web">
          <article-title>namisan/mt-dnn: Multi-Task Deep Neural Networks for Natural Language Understanding</article-title>
          <source>GitHub</source>
          <access-date>2020-11-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/namisan/mt-dnn">https://github.com/namisan/mt-dnn</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="web">
          <article-title>Adam: A Method for Stochastic Optimization</article-title>
          <source>International Conference on Learning Representations ICLR</source>
          <year>2015</year>
          <access-date>2020-11-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1412.6980.pdf">https://arxiv.org/pdf/1412.6980.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>RoBERTa: A robustly optimized bert pretraining approach</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <access-date>2020-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1907.11692">http://arxiv.org/abs/1907.11692</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lopyrev</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>SQuAD: 100,000+ Questions for Machine Comprehension of Text</article-title>
          <source>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>EMNLP'16</conf-name>
          <conf-date>November 1-5, 2016</conf-date>
          <conf-loc>Austin, Texas</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d16-1264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dagan</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Glickman</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Magnini</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <source>The PASCAL Recognising Textual Entailment Challenge</source>
          <year>2006</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mahajan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tsou</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Extracting Daily Dosage from Medication Instructions in EHRs: An Automated Approach and Lessons Learned</article-title>
          <source>arXiv org</source>
          <year>2020</year>
          <access-date>2020-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/2005.10899.pdf">https://arxiv.org/pdf/2005.10899.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lindberg</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Humphreys</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>McCray</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The unified medical language system</article-title>
          <source>Methods Inf Med</source>
          <year>1993</year>
          <volume>32</volume>
          <issue>04</issue>
          <fpage>281</fpage>
          <lpage>291</lpage>
          <pub-id pub-id-type="doi">10.1055/s-0038-1634945</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>GA</given-names>
            </name>
          </person-group>
          <article-title>WordNet: a lexical database for English</article-title>
          <source>Commun ACM</source>
          <year>1995</year>
          <month>11</month>
          <volume>38</volume>
          <issue>11</issue>
          <fpage>39</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.1145/219717.219748</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bergstra</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bardenet</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kégl</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Algorithms for Hyper-parameter Optimization</article-title>
          <source>Proceedings of the 24th International Conference on Neural Information Processing Systems</source>
          <year>2011</year>
          <conf-name>NIPS'11</conf-name>
          <conf-date>December 12-14, 2011</conf-date>
          <conf-loc>Granada, Spain</conf-loc>
          <pub-id pub-id-type="doi">10.5555/2986459.2986743</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Practical Block-Wise Neural Network Architecture Generation</article-title>
          <source>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>
          <year>2018</year>
          <conf-name>IEEE'18</conf-name>
          <conf-date>June 18-22, 2018</conf-date>
          <conf-loc>Salt Lake City, UT</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2018.00257</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <volume>2</volume>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1910.01108"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tenney</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pavlick</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>BERT Rediscovers the Classical NLP Pipeline</article-title>
          <source>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2019</year>
          <conf-name>ACL'19</conf-name>
          <conf-date>July 28-August 2, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rogers</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kovaleva</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Rumshisky</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A primer in BERTology: what we know about how BERT works</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2002.12327"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tenney</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McCoy</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Van Durme</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pavlick</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>What Do You Learn From Context? Probing for Sentence Structure in Contextualized Word Representations</article-title>
          <source>Seventh International Conference on Learning Representations</source>
          <year>2019</year>
          <conf-name>ICLR'19</conf-name>
          <conf-date>May 6-9, 2019</conf-date>
          <conf-loc>New Orleans</conf-loc>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
