<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i12e27386</article-id>
      <article-id pub-id-type="pmid">34967748</article-id>
      <article-id pub-id-type="doi">10.2196/27386</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Benchmarking Effectiveness and Efficiency of Deep Learning Models for Semantic Textual Similarity in the Clinical Domain: Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Webb-Robertson</surname>
            <given-names>Bobbie-Jo</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Manzanares</surname>
            <given-names>Maria</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Qingyu</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6036-1516</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Rankine</surname>
            <given-names>Alex</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1581-2066</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Peng</surname>
            <given-names>Yifan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9309-8331</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Aghaarabi</surname>
            <given-names>Elaheh</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6641-333X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>Zhiyong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>National Center for Biotechnology Information, National Library of Medicine, National Institutes of Health</institution>
            <addr-line>8600 Rockville Pike</addr-line>
            <addr-line>Bethesda, MD, 20894</addr-line>
            <country>United States</country>
            <phone>1 301 594 7089</phone>
            <email>luzh@ncbi.nlm.nih.gov</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9998-916X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>National Center for Biotechnology Information, National Library of Medicine, National Institutes of Health</institution>
        <addr-line>Bethesda, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Harvard College</institution>
        <addr-line>Cambridge, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Weill Cornell Medicine</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Towson University</institution>
        <addr-line>Towson, MD</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Zhiyong Lu <email>luzh@ncbi.nlm.nih.gov</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>12</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>12</issue>
      <elocation-id>e27386</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>1</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>3</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>6</day>
          <month>8</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>6</day>
          <month>8</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Qingyu Chen, Alex Rankine, Yifan Peng, Elaheh Aghaarabi, Zhiyong Lu. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 30.12.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/12/e27386" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Semantic textual similarity (STS) measures the degree of relatedness between sentence pairs. The Open Health Natural Language Processing (OHNLP) Consortium released an expertly annotated STS data set and called for the National Natural Language Processing Clinical Challenges. This work describes our entry, an ensemble model that leverages a range of deep learning (DL) models. Our team from the National Library of Medicine obtained a Pearson correlation of 0.8967 in an official test set during 2019 National Natural Language Processing Clinical Challenges/Open Health Natural Language Processing shared task and achieved a second rank.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Although our models strongly correlate with manual annotations, annotator-level correlation was only moderate (weighted Cohen <italic>κ</italic>=0.60). We are cautious of the potential use of DL models in production systems and argue that it is more critical to evaluate the models in-depth, especially those with extremely high correlations. In this study, we benchmark the effectiveness and efficiency of top-ranked DL models. We quantify their robustness and inference times to validate their usefulness in real-time applications.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We benchmarked five DL models, which are the top-ranked systems for STS tasks: Convolutional Neural Network, BioSentVec, BioBERT, BlueBERT, and ClinicalBERT. We evaluated a random forest model as an additional baseline. For each model, we repeated the experiment 10 times, using the official training and testing sets. We reported 95% CI of the Wilcoxon rank-sum test on the average Pearson correlation (official evaluation metric) and running time. We further evaluated Spearman correlation, R², and mean squared error as additional measures.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Using only the official training set, all models obtained highly effective results. BioSentVec and BioBERT achieved the highest average Pearson correlations (0.8497 and 0.8481, respectively). BioSentVec also had the highest results in 3 of 4 effectiveness measures, followed by BioBERT. However, their robustness to sentence pairs of different similarity levels varies significantly. A particular observation is that BERT models made the most errors (a mean squared error of over 2.5) on highly similar sentence pairs. They cannot capture highly similar sentence pairs effectively when they have different negation terms or word orders. In addition, time efficiency is dramatically different from the effectiveness results. On average, the BERT models were approximately 20 times and 50 times slower than the Convolutional Neural Network and BioSentVec models, respectively. This results in challenges for real-time applications.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Despite the excitement of further improving Pearson correlations in this data set, our results highlight that evaluations of the effectiveness and efficiency of STS models are critical. In future, we suggest more evaluations on the generalization capability and user-level testing of the models. We call for community efforts to create more biomedical and clinical STS data sets from different perspectives to reflect the multifaceted notion of sentence-relatedness.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>semantic textual similarity</kwd>
        <kwd>deep learning</kwd>
        <kwd>biomedical and clinical text mining</kwd>
        <kwd>word embeddings</kwd>
        <kwd>sentence embeddings</kwd>
        <kwd>transformers</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Semantic textual similarity (STS), a measure of the degree of relatedness between sentence pairs, is an important text-mining research topic [<xref ref-type="bibr" rid="ref1">1</xref>]. STS has been widely used in biomedical and clinical domains, including information retrieval (finding relevant sentences or passages [<xref ref-type="bibr" rid="ref2">2</xref>]), biocuration (finding key sentences for evidence attribution [<xref ref-type="bibr" rid="ref3">3</xref>]), and question answering (finding answer-snippet candidates [<xref ref-type="bibr" rid="ref4">4</xref>]). Despite its importance, expertly annotated STS data sets are lacking in the biomedical and clinical domains. For example, STS-related data sets in the general domain have been developed for nearly a decade, with almost 30,000 annotated sentence pairs in total [<xref ref-type="bibr" rid="ref5">5</xref>], whereas similar data sets in the biomedical and clinical domains had only hundreds of pairs in total before 2018 [<xref ref-type="bibr" rid="ref6">6</xref>]. The organizers of the Open Health Natural Language Processing (OHNLP) Consortium have dedicated efforts to expanding such data sets and establishing STS open challenges in the clinical domain since 2018. MEDSTS [<xref ref-type="bibr" rid="ref7">7</xref>], consisting of 1068 curated sentence pairs, was used in the BioCreative/OHNLP challenge task in 2018 [<xref ref-type="bibr" rid="ref8">8</xref>]. In 2019, over 1000 curated sentence pairs were added to the MEDSTS, renamed ClinicalSTS [<xref ref-type="bibr" rid="ref9">9</xref>], which was used in the National Natural Language Processing Clinical Challenges (n2c2)/OHNLP. This work is a poststudy of the n2c2/OHNLP challenge.</p>
        <p>Overall, 33 teams submitted 87 models to the n2c2/OHNLP challenge task; Pearson correlation was used as the evaluation measure, ranging from −1 (strong negative relationship) to 1 (strong positive relationship). Our National Library of Medicine and National Center for Biotechnology Information team developed an ensemble model by leveraging a range of deep learning models from 3 categories: word embedding based, sentence embedding based, and transformer based (which is described in the following sections). This model achieved a Pearson correlation of 0.8967 in the official test set, ranking second among all of the teams (<italic>P</italic>=.88 compared with the first rank, with a Pearson correlation of 0.9010). The top 10 best team submissions demonstrated relatively close performances with Pearson correlations of 0.85 to 0.90. According to the organizer’s overview, most of the top systems used deep learning models [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <p>A Pearson correlation of approximately 0.9 suggests that the model’s predictions have a very strong correlation with gold standard annotations [<xref ref-type="bibr" rid="ref10">10</xref>]. Such results might give the impression that deep learning models have already solved STS in the clinical domain. Nevertheless, the human-level correlation in this data set is significantly lower; for example, the agreement between 2 annotators in ClinicalSTS had a weighted Cohen <italic>κ</italic> of 0.6 [<xref ref-type="bibr" rid="ref9">9</xref>], suggesting that only a moderate level of correlation was achieved by human experts [<xref ref-type="bibr" rid="ref10">10</xref>]. Therefore, we urge caution with regard to the extremely high correlation achieved by the models (which might be potentially due to overfitting) and argue that it is critical to understand how these models perform in reality rather than further improve the performance in this data set. Therefore, in this postchallenge study, we aim to analyze the effectiveness and efficiency of 5 deep learning models in depth:</p>
        <list list-type="bullet">
          <list-item>
            <p>For effectiveness, we investigate how a single deep learning model performs in this specific data set and further analyze the robustness of models in sentence pairs of different degrees of similarity.</p>
          </list-item>
          <list-item>
            <p>For efficiency, we measure the inference time taken by the deep learning models in the testing set. This is an important indicator of whether these models can be used in real-time applications, such as sentence search engines. To the best of our knowledge, few studies on STS in the biomedical and clinical domains have considered model efficiency. However, given that models have already achieved a Pearson correlation of approximately 0.90, measuring efficiency is arguably more important, as it quantifies whether these models could be used in production.</p>
          </list-item>
        </list>
        <p>The principal findings are 2-fold. First, a single deep learning model trained directly on the official training set only (ie, without more advanced techniques, such as multitask learning and transfer learning) could already achieve a maximum Pearson correlation of 0.87; however, the models’ robustness to sentence pairs of different similarity levels differs significantly. A particular observation is that BERT models made the most errors (a mean squared error of over 2.5) on highly similar sentence pairs (similarity no less than 4). BERT models cannot capture highly similar sentence pairs effectively when they have different negation terms or word orders. Second, although the deep learning models achieved relatively close Pearson correlations (from 0.82 to 0.87; single models), the time efficiency differed dramatically. For example, the difference in Pearson correlations of BERT and sentence embedding models was within 0.002, but the inference time of BERT models was approximately 50 times greater than that of sentence embedding models. This brings practical challenges to using BERT models in real-time applications, especially without the availability of graphics processing units (GPUs). Furthermore, although there has been a tremendous effort to make ClinicalSTS available to the community, their source corpora inevitably limit the diversity of sentence pairs and introduce annotation inconsistencies. Thus, we call for community efforts to create more STS data sets from different perspectives to reflect the multifaceted notion of sentence relatedness; this, in turn, will further improve the generalization performance of deep learning models.</p>
        <p>Here, we introduce popular deep learning STS methods that have been used in the biomedical and clinical domains. The methods are broadly categorized in terms of the language models applied: word embeddings, sentence embeddings, and transformers.</p>
      </sec>
      <sec>
        <title>Word Embedding–Based Models</title>
        <p>Word embeddings are relatively early language models that significantly change how text is modeled. The semantic of each word is represented in a high-dimensional vector trained on large-scale corpora in an unsupervised manner. Primary word embedding methods include (1) word2vec, based on local contexts, such as using a word as input to predict its nearby words [<xref ref-type="bibr" rid="ref11">11</xref>]; (2) Glove, based on global co-occurrence statistics [<xref ref-type="bibr" rid="ref12">12</xref>]; and (3) fastText, which extends word2vec by adding word n-grams [<xref ref-type="bibr" rid="ref13">13</xref>]. Many word embedding variations (eg, pretrained in the biomedical or clinical corpora, integrated with entities, and adopted retrofitting methods) are publicly available [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. First, word embedding–based STS models use these embeddings to obtain vector representations of the words in sentence pairs and then use either Convolutional Neural Networks (CNNs) or recurrent neural networks to process (typically to obtain spatial or semantic patterns), followed by fully-connected layers to make predictions [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      </sec>
      <sec>
        <title>Sentence Embedding–Based Models</title>
        <p>Sentence embeddings extend word embeddings by modeling sentence-level representations. The primary methods include (1) Doc2vec, similar to word2vec, using a word as input and predicting the paragraph rather than nearby words [<xref ref-type="bibr" rid="ref17">17</xref>]; (2) FastSent, using a sentence as input and predicting the adjacent sentences [<xref ref-type="bibr" rid="ref18">18</xref>]; and (3) SentVec, which extends word2vec and fastText by using both words (and their n-grams) and the associated sentences as inputs for training [<xref ref-type="bibr" rid="ref19">19</xref>]. Compared with word embedding–based models, sentence embedding–based STS models are simpler: first, they use sentence embeddings to obtain sentence vectors and then use fully-connected layers for predictions [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
      <sec>
        <title>Transformer-Based Models</title>
        <p>Transformers are recent language models that revolutionize text representation methods. Using a self-attention mechanism, this model can capture long-range dependencies [<xref ref-type="bibr" rid="ref21">21</xref>]. Transformer-based language models, such as BERT [<xref ref-type="bibr" rid="ref22">22</xref>] and GPT [<xref ref-type="bibr" rid="ref23">23</xref>], have replaced recurrent neural networks for many text-based applications. To date, many transformers pretrained in the general or biomedical and clinical domains are publicly available [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. Similar to sentence embedding–based models, transformer-based STS models directly use transformers to obtain sentence representations and then use fully-connected layers for predictions [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Sentence Similarity Models</title>
        <sec>
          <title>Overview</title>
          <p>Five deep learning STS models from the 3 categories above were benchmarked: the Convolutional Neural Network (CNN) model [<xref ref-type="bibr" rid="ref28">28</xref>] (from the word embedding–based category), the sentence embedding model, using BioSentVec [<xref ref-type="bibr" rid="ref29">29</xref>] (from the sentence embedding–based category), and transformer models (from the transformer-based category), using BioBERT [<xref ref-type="bibr" rid="ref24">24</xref>], BlueBERT [<xref ref-type="bibr" rid="ref25">25</xref>], and ClinicalBERT [<xref ref-type="bibr" rid="ref26">26</xref>]. We chose these models because they achieved top-ranked performance in STS-based tasks [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. The general architecture is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, and the descriptions are as follows.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Model architecture overview. (A), (B), and (C) demonstrate the architecture of the Convolutional Neural Network (CNN), BioSentVec, and Bidirectional Encoder Representations from Transformers models, respectively. Details are provided in the Methods section. BERT: Bidirectional Encoder Representations from Transformers; CONV: convolutional layer; FC: fully-connected layer.</p>
            </caption>
            <graphic xlink:href="medinform_v9i12e27386_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Word Embedding–Based Model (CNN Model)</title>
          <p>We adapted the CNN model from a study by Shao [<xref ref-type="bibr" rid="ref28">28</xref>], a top-ranked system in SemEval-2017 Task 1. The CNN model transforms the input sentence pair into vectors and learns the similarities between the corresponding vectors. The backbone is a Siamese neural network, whereby the model weights are shared when processing the 2 input sentences. The model consists of 3 layers (shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>A). The first embedding layer consists of word and character embeddings. It is used to transform the raw text into a 2D semantic vector space. In this study, we evaluated several word embeddings. We found that word embeddings pretrained in the biomedical and clinical domains did not have additional advantages in this specific data set. This observation is consistent with the previous word embedding evaluation using the same source data set [<xref ref-type="bibr" rid="ref15">15</xref>]. Therefore, we used Glove pretrained in the general domain for the following experiments. The second layer consists of convolutional and max-pooling layers to extract special information from the embeddings. Therefore, the 2D semantic vector space is transformed into a 1D vector to represent the semantics of a sentence. The third layer provides a calculation of the absolute difference and dot product between the vectors of the 2 sentences. This is followed by the fully-connected layers to produce the final similarity prediction.</p>
        </sec>
        <sec>
          <title>Sentence Embedding Model (BioSentVec Model)</title>
          <p>We used the model from [<xref ref-type="bibr" rid="ref29">29</xref>], which achieved the highest performance on MEDSTS for the post–BioCreative/OHNLP challenge task [<xref ref-type="bibr" rid="ref20">20</xref>]. The model structure is similar to the CNN model above, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>B. The primary difference is that this model uses BioSentVec to directly produce the sentence vectors. Therefore, there are no convolutional or pooling layers.</p>
        </sec>
        <sec>
          <title>Transformer-Based Model (BioBERT, BlueBERT, and ClinicalBERT Models)</title>
          <p>This model structure is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>C. First, the sentences were concatenated as one input (as recommended by the authors of BERT [<xref ref-type="bibr" rid="ref22">22</xref>]), followed by a BERT module and fully-connected layers. We benchmarked 3 different BERT modules: (1) BioBERT [<xref ref-type="bibr" rid="ref24">24</xref>], pretrained on PubMed abstracts and PubMed Central full-text articles; (2) BlueBERT [<xref ref-type="bibr" rid="ref25">25</xref>], pretrained on PubMed abstracts and Medical Information Mart for Intensive Care-III clinical notes; and (3) ClinicalBERT [<xref ref-type="bibr" rid="ref26">26</xref>], pretrained on clinical notes using the weights from BioBERT.</p>
        </sec>
        <sec>
          <title>Additional Machine Learning Baseline Model (Random Forest)</title>
          <p>Although the top-performing submissions used deep learning–based models [<xref ref-type="bibr" rid="ref30">30</xref>], it is also critical to compare with traditional machine learning–based models to better understand the effectiveness and efficiency of deep learning–based models. Therefore, we evaluated the performance of a classic machine learning model as an additional baseline. Specifically, we adapted the random forest model, which achieved the best performance out of 13 submissions in the 2018 BioCreative/OHNLP challenge task [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. This model uses manually engineered features in 5 dimensions to capture sentence similarity: token-based, character-based, sequence-based, semantic-based, and entity-based. We performed feature selection based on the performance of the validation set and ultimately selected 13 features.</p>
        </sec>
      </sec>
      <sec>
        <title>Data Set, Evaluation Metric, and Hyperparameter Tuning</title>
        <p>The details of the data set are presented in the data description studies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. In short, the data set consists of 2054 sentence pairs, with the similarity annotated on a scale of 0 to 5: (1) 0, if the 2 sentences are entirely dissimilar; (2) 1, if the 2 sentences are dissimilar but have the same topic; (3) 2, if the 2 sentences are not equivalent but share some details; (4) 3, if the 2 sentences are roughly equivalent but some important information is different; (5) 4, if the 2 sentences are mostly equivalent and only minor details differ; and (6) 5, if the 2 sentences are semantically equivalent [<xref ref-type="bibr" rid="ref7">7</xref>]. The data set was annotated by 2 medical experts, with a weighted Cohen <italic>κ</italic> of 0.60 as the interannotator agreement measure [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <p>The training and testing sets were officially released by the task organizers and consisted of 1642 and 429 sentence pairs, respectively. We randomly sampled approximately 20% of the sentence pairs (329 pairs) from the training set as the validation set. The Pearson correlation coefficient was used as the official evaluation metric.</p>
        <p>Given that the models have different architectures and hyperparameters, we performed hyperparameter tuning for the CNN, BioSentVec, and BERT models separately, rather than using the same values. The values of the hyperparameters are listed in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Hyperparameters of the sentence similarity models. Common hyperparameters are shared among all of the models. In contrast, model-specific hyperparameters are only for specific models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="340"/>
            <col width="0"/>
            <col width="220"/>
            <col width="0"/>
            <col width="240"/>
            <col width="0"/>
            <col width="170"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">Hyperparameters</td>
                <td colspan="2">CNN<sup>a</sup></td>
                <td colspan="2">BioSentVec</td>
                <td>BERT<sup>b</sup> variation</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="8">
                  <bold>Common hyperparameters</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FC<sup>c</sup> layers</td>
                <td colspan="2">128</td>
                <td colspan="2">512, 256, 128, 32</td>
                <td colspan="2">128, 32</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Dropout</td>
                <td colspan="2">0.5</td>
                <td colspan="2">0.5</td>
                <td colspan="2">0.5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Optimizer</td>
                <td colspan="2">Adam</td>
                <td colspan="2">SGD<sup>d</sup></td>
                <td colspan="2">AdamWarmup</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Learning rate</td>
                <td colspan="2">1e-3</td>
                <td colspan="2">5e-3</td>
                <td colspan="2">2e-5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Batch size</td>
                <td colspan="2">64</td>
                <td colspan="2">16</td>
                <td colspan="2">32</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Specific hyperparameters</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maximum length</td>
                <td colspan="2">170</td>
                <td colspan="2">N/A<sup>e</sup></td>
                <td colspan="2">128</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Conv<sup>f</sup></td>
                <td colspan="2">1800</td>
                <td colspan="2">N/A</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pooling</td>
                <td colspan="2">Maximum</td>
                <td colspan="2">N/A</td>
                <td colspan="2">Maximum</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>CNN: Convolutional Neural Network.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>FC: fully-connected.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>SGD: stochastic gradient descent.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>Conv: convolutional layers.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Evaluation Methods</title>
        <p>We measured the Pearson correlation (for effectiveness) and the running time in seconds (for efficiency) on the testing set. To compare the 5 models quantitatively, we repeated the experiments 10 times on the same training, validation, and testing sets and reported the results of Wilcoxon rank-sum test on the average Pearson correlation and running time at 95% CI. We chose the same evaluation metric and statistical test as the task organizers for consistency [<xref ref-type="bibr" rid="ref9">9</xref>]. We further evaluated the Spearman correlation, R², and mean square error as additional metrics for effectiveness.</p>
        <p>In practice, the running time can be significantly affected by the computing environment rather than the model architecture. For instance, GPUs could significantly boost the inference time; however, many sentence search servers (especially research tools) may not have GPUs available. Different multi-processing methods may have an impact on the running time as well. For a fair comparison, we used a single processor on the central processing unit for model inference on the testing set and tracked the running time accordingly.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Effectiveness and Efficiency Results</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> presents the effectiveness and efficiency results. All 5 deep learning models had reasonable and very close effectiveness results for this data set. The difference between the average Pearson correlation was within 3%. The BioSentVec model achieved the highest Pearson correlation (0.8497), followed by BioBERT (0.8481; <italic>P</italic>=.74). The deep learning models had approximately 15% higher Pearson correlation than the baseline random forest model. In addition, the results demonstrate that a single deep learning model can achieve a maximum Pearson correlation score of 0.87. We further developed a model by averaging the predictions of the 4 best models. The ensemble model further improved the score to close to 0.90. This observation is consistent with our submission results. <xref ref-type="table" rid="table3">Table 3</xref> provides additional effectiveness measures. BioSentVec consistently showed the highest performance in 3 out of 4 metrics, followed by BioBERT.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Effectiveness and efficiency results for the official test set. The models are ranked by the mean effectiveness results in descending order. The <italic>P</italic> value of the Wilcoxon rank-sum test at a 95% CI is shown for each model compared with the model with the highest effectiveness or efficiency results. The results of the ensemble model also are provided; however, this study focuses on single models in terms of, for example, their robustness to sentence pairs of different similarity levels and their inference time for production purposes.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="210"/>
            <col width="0"/>
            <col width="160"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="0"/>
            <col width="160"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Model</td>
                <td colspan="7"> Effectiveness (Pearson correlation)</td>
                <td colspan="5">Efficiency (seconds)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Values, mean (SD)</td>
                <td colspan="2"><italic>P</italic> value</td>
                <td colspan="2">Maximum<break/>effectiveness</td>
                <td colspan="3">Values, mean (SD)</td>
                <td colspan="2"><italic>P</italic> value</td>
                <td>Lowest efficiency</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="15">
                  <bold>Five benchmarking models</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioSentVec</td>
                <td colspan="2">0.8497 (0.0099)</td>
                <td colspan="2">N/A<sup>a</sup></td>
                <td colspan="2">0.8654</td>
                <td colspan="3">1.48 (0.23)</td>
                <td colspan="2">N/A</td>
                <td colspan="2">1.96</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioBERT</td>
                <td colspan="2">0.8481 (0.0122)</td>
                <td colspan="2">.74</td>
                <td colspan="2">0.8698</td>
                <td colspan="3">85.05 (4.93)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">95.66</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ClinicalBERT</td>
                <td colspan="2">0.8442 (0.0161)</td>
                <td colspan="2">.39</td>
                <td colspan="2">0.8677</td>
                <td colspan="3">85.20 (4.74)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">95.21</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BlueBERT</td>
                <td colspan="2">0.8320 (0.0232)</td>
                <td colspan="2">.02</td>
                <td colspan="2">0.8613</td>
                <td colspan="3">84.81 (1.63)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">88.22</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>CNN<sup>b</sup></td>
                <td colspan="2">0.8224 (0.0043)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">0.8307</td>
                <td colspan="3">4.35 (0.27)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">4.97</td>
              </tr>
              <tr valign="top">
                <td colspan="15">
                  <bold>Additional machine learning baseline model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Random forest</td>
                <td colspan="2">0.6848 (0.0022)</td>
                <td colspan="2">N/A</td>
                <td colspan="2">N/A</td>
                <td colspan="3">0.03 (0.00)</td>
                <td colspan="2">.99</td>
                <td colspan="2">0.03</td>
              </tr>
              <tr valign="top">
                <td colspan="15">
                  <bold>Ensembled model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ensemble model</td>
                <td colspan="2">0.8782</td>
                <td colspan="2">N/A</td>
                <td colspan="2">0.8940</td>
                <td colspan="3">N/A</td>
                <td colspan="2">N/A</td>
                <td colspan="2"> N/A</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>CNN: Convolutional Neural Network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Additional effectiveness results of individual models. The models are ranked by the Pearson correlation coefficient in descending order.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="220"/>
            <col width="0"/>
            <col width="200"/>
            <col width="0"/>
            <col width="210"/>
            <col width="0"/>
            <col width="190"/>
            <col width="0"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Model</td>
                <td colspan="7">Values, mean (SD)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Pearson correlation</td>
                <td colspan="2">Spearman correlation</td>
                <td colspan="2">R<sup>2</sup> <sup>a</sup></td>
                <td>MSE<sup>b</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>Five benchmarking models</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioSentVec</td>
                <td colspan="2">0.8497 (0.0099)</td>
                <td colspan="2">0.7708 (0.0073)</td>
                <td colspan="2">0.6705 (0.0325)</td>
                <td colspan="2">0.8709 (0.0434)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioBERT</td>
                <td colspan="2">0.8481 (0.0122)</td>
                <td colspan="2">0.7951 (0.0100)</td>
                <td colspan="2">0.6636 (0.0275)</td>
                <td colspan="2">0.8803 (0.0362)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ClinicalBERT</td>
                <td colspan="2">0.8442 (0.0161)</td>
                <td colspan="2">0.8066 (0.0149)</td>
                <td colspan="2">0.6357 (0.0391)</td>
                <td colspan="2">0.9155 (0.0502)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BlueBERT</td>
                <td colspan="2">0.8320 (0.0232)</td>
                <td colspan="2">0.7701 (0.0244)</td>
                <td colspan="2">0.6520 (0.0544)</td>
                <td colspan="2">0.8935 (0.0670)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>CNN<sup>c</sup></td>
                <td colspan="2">0.8224 (0.0043)</td>
                <td colspan="2">0.7674 (0.0087)</td>
                <td colspan="2">0.6136 (0.0436)</td>
                <td colspan="2">0.9428 (0.0519)</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Additional machine learning baseline model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Random forest</td>
                <td colspan="2">0.6848 (0.0022)</td>
                <td colspan="2">0.6572 (0.0027)</td>
                <td colspan="2">0.4154 (0.0025)</td>
                <td colspan="2">1.1614 (0.0025)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>R<sup>2</sup>: coefficient of determination.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>MSE: mean square error.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>CNN: Convolutional Neural Network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In contrast to the effectiveness results, the efficiency results differed dramatically among the models. As shown in <xref ref-type="table" rid="table2">Table 2</xref>, it took about 1.5 seconds, on average, for the BioSentVec model to predict the similarities of 429 sentence pairs in the testing set; the counterpart of the CNN model took about 4.5 seconds, on average. In contrast, all BERT models require more than 80 seconds, on average, for inference.</p>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>We further analyzed the common errors made by the models. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows the quantitative evaluations. We categorized the sentences into 5 groups based on the annotation guidelines and measured the MSE between the gold standard and predictions. Note that we did not use Pearson correlations as they are heavily influenced by the limited number of instances in small categories [<xref ref-type="bibr" rid="ref20">20</xref>]. MSE is thus used as an alternative metric, which has also been used as a loss function for many deep learning models for regression-based applications.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Mean squared error (MSE) of the models for each similarity range. Each category shows the number of sentence pairs and associated MSE of the models. The overall MSE (median, SD) is also provided in the legend. CNN: Convolutional Neural Network.</p>
          </caption>
          <graphic xlink:href="medinform_v9i12e27386_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows 2 primary observations. First, the random forest model had the highest MSE for the pairs with similarity scores between 0 and 1; the error rate was almost twice that of the deep learning models. In contrast, the MSEs of the random forest in other similarity categories were much smaller. This suggests that the random forest model may not effectively identify sentence pairs of low similarity. We manually examined the sentence pairs of low similarity and provided representative examples where the random forest model had a larger MSE than the other models, along with the predictions of BioBERT and BioSentVec for comparison (<xref ref-type="table" rid="table4">Table 4</xref>). The errors shared consistent patterns where (1) the sentence structure was similar (eg, both started with “The patient...”), (2) the pairs shared many common or similar words (eg, case 4 shares “examined and normal”), and (3) the semantics of the pairs were rather different. In such cases, the random forest model failed to capture the semantics at the sentence level. In addition, cases 1-3 had the gold standard annotation score of 0, whereas the similar case 5 had the counterpart of 1. One may argue that the drugs in case 5 are rather different, and the procedure was independent and could have a score of 0; alternatively, given the score of case 5, cases 1-3 could arguably have the same score as well because they were all related to patient status (similarly, both BioSentVec and BioBERT provided consistent scores on these cases). This is also consistent with the findings of the task organizers [<xref ref-type="bibr" rid="ref9">9</xref>], which demonstrated that annotating the sentence similarity is a challenging task as relatedness is context-dependent.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Qualitative examples with a relatively large mean squared error for the random forest model for sentence pair scores from 0.0 to 1.0.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="50"/>
            <col width="530"/>
            <col width="110"/>
            <col width="110"/>
            <col width="90"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Case</td>
                <td>Sentence pairs</td>
                <td>Gold standard</td>
                <td>Random forest</td>
                <td>BioBERT</td>
                <td>BioSentVec</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The patient tolerated the procedure well and was transferred to the recovery room in stable condition.</p>
                    </list-item>
                    <list-item>
                      <p>The patient was transferred to the patient appointment coordinator for an appointment to be scheduled within the timeframe advised.</p>
                    </list-item>
                  </list>
                </td>
                <td>0.0</td>
                <td>2.5</td>
                <td>0.5</td>
                <td>1.2</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Patient to call to schedule additional treatment sessions as needed otherwise patient dismissed from therapy.</p>
                    </list-item>
                    <list-item>
                      <p>Patient tolerated session without adverse reactions to therapy.</p>
                    </list-item>
                  </list>
                </td>
                <td>0.0</td>
                <td>3.4</td>
                <td>1.4</td>
                <td>1.4</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Patient was agreeable to speaking with social work.</p>
                    </list-item>
                    <list-item>
                      <p>Patient was able to teach back concepts discussed.</p>
                    </list-item>
                  </list>
                </td>
                <td>0.0</td>
                <td>2.0</td>
                <td>1.7</td>
                <td>1.7</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Left upper extremity: Inspection, palpation examined and normal.</p>
                    </list-item>
                    <list-item>
                      <p>Abdomen: Liver and spleen, bowel sounds examined and normal.</p>
                    </list-item>
                  </list>
                </td>
                <td>0.5</td>
                <td>2.4</td>
                <td>2.1</td>
                <td>1.1</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>glucosamine capsule 1 capsule by mouth one time daily.</p>
                    </list-item>
                    <list-item>
                      <p>Claritin tablet 1 tablet by mouth one time daily.</p>
                    </list-item>
                  </list>
                </td>
                <td>1.0</td>
                <td>2.6</td>
                <td>1.7</td>
                <td>1.5</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Second, all the deep learning models, except the CNN model, showed reasonable performance for the pairs with similarity scores between 1 and 4. The MSE was mainly within 1, suggesting that the predictions were likely in the same category as the gold standard. However, the BERT models had a much higher MSE for the pairs with scores from 4 to 5. For example, ClinicalBERT had an MSE of over 2.5, whereas the counterparts of both CNN and BioSentVec were lower than 1. Similarly, the variance of BERT models on sentence pairs with similarity scores from 4 to 5 was also larger than that of the other models. <xref ref-type="table" rid="table5">Table 5</xref> shows the representative sentence pairs for which ClinicalBERT had a larger MSE than the other models, along with the predictions of BioBERT and BioSentVec for comparison. The examples indicated that ClinicalBERT could not capture highly similar sentence pairs when there are different negation terms (eg, case 1) or when the word order is switched (eg, case 2) as compared with BioBERT and BioSentVec. Similarly, interannotator consistency may also have an impact on MSE. For example, sentence pairs from cases 4 and 5 arguably belong to the same category, as the pairs share the majority of information, except for minor differences.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Qualitative examples with a relatively large mean squared error for Bidirectional Encoder Representations from Transformers models for sentence pair scores from 4.0 to 5.0.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="50"/>
            <col width="530"/>
            <col width="110"/>
            <col width="110"/>
            <col width="90"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Case</td>
                <td>Sentence pairs</td>
                <td>Gold standard</td>
                <td>ClinicalBERT</td>
                <td>BioBERT</td>
                <td>BioSentVec</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Heart: S1/S2 regular rate and rhythm, without murmurs, gallops, or rubs</p>
                    </list-item>
                    <list-item>
                      <p>Heart: S1, S2, regular rate and rhythm, no abnormal heart sounds or murmur</p>
                    </list-item>
                  </list>
                </td>
                <td>5.0</td>
                <td>2.5</td>
                <td>3.4</td>
                <td>3.9</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>He denies chest pain or shortness of breath</p>
                    </list-item>
                    <list-item>
                      <p>He denies shortness of breath or chest pain</p>
                    </list-item>
                  </list>
                </td>
                <td>5.0</td>
                <td>2.3</td>
                <td>3.3</td>
                <td>3.9</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>This patient benefits from skilled occupational and/or physical therapy to improve participation in daily occupations</p>
                    </list-item>
                    <list-item>
                      <p>Medical necessity: the patient would benefit from skilled physical therapy interventions to be able to return to work and engage in self-care activities</p>
                    </list-item>
                  </list>
                </td>
                <td>4.0</td>
                <td>2.4</td>
                <td>2.2</td>
                <td>2.5</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>All questions were answered to the parent’s satisfaction</p>
                    </list-item>
                    <list-item>
                      <p>All questions were answered and consent was given to proceed</p>
                    </list-item>
                  </list>
                </td>
                <td>4.0</td>
                <td>2.8</td>
                <td>2.6</td>
                <td>3.7</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The patient understands and is happy with the plan</p>
                    </list-item>
                    <list-item>
                      <p>The patient verbalized understanding and wishes to proceed</p>
                    </list-item>
                  </list>
                </td>
                <td>5.0</td>
                <td>3.0</td>
                <td>2.9</td>
                <td>3.6</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study has 2 primary findings. First, the effectiveness of deep learning models on this data set is high (all 5 models have a Pearson correlation of over 0.8, which is approximately 15% higher than that of the traditional machine learning model) and relatively close (the Pearson correlation difference is within 0.03 among the models), but their efficiency is significantly different. BERT models are, on average, 20-50 times slower than the CNN and BioSentVec models, respectively.</p>
        <p>The dramatically different efficiency results lead to the concern of using STS models in real-world applications in the biomedical and clinical domains. To demonstrate this, we further quantified the number of sentence pairs that could be computed in real-time based on the sentence search pipeline in LitSense [<xref ref-type="bibr" rid="ref2">2</xref>]. LitSense is a web server for searching for relevant sentences from approximately 30 million PubMed abstracts and approximately 3 million PubMed Central full-text articles. To find relevant sentences for a query, it uses the standard BM25 to retrieve top candidates and then reranks the candidates using deep learning models. The rerank stage in LitSense is allocated for 300 ms based on evaluations of the developers. Using 300 ms as the threshold, BERT models can rerank only 2 pairs in real-time, whereas the CNN and BioSentVec models can rerank approximately 30 and 87 pairs, respectively. It should be noted that the results here are for demonstration purposes. In practice, as mentioned above, many factors could impact the inference time, such as GPUs and efficient multi-processing procedures. The real inference time might differ, but the difference between the models holds, as we fairly compared all of the models in the same setting. On the basis of these results, we suggest using compressed or distilled BERT models [<xref ref-type="bibr" rid="ref31">31</xref>] for real-time applications, especially when production servers do not have available GPUs.</p>
        <p>The second primary finding is that the random forest model made more errors in sentence pairs of low similarity (similarity scores from 0 to 1), whereas BERT models made more errors on highly similar sentence pairs (similarity scores from 4 to 5). The random forest model cannot effectively capture the sentence semantics when a sentence pair shares consistent structures and similar words but distinct topics. In contrast, ClinicalBERT had an MSE of over 2.5 for highly similar sentence pairs, especially when different negation terms or the word order is switched. As mentioned above, the results also suggest that interannotator consistency may also impact MSE, showing the difficulty of relatedness-based tasks.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The main limitation of this study is that the analysis was conducted using the ClinicalSTS data set alone. To the best of our knowledge, the data set is already the largest available sentence similarity data set in this domain. Other data sets, such as BOSSES, are much smaller. We believe that it is critical to develop more sentence similarity data sets from other sources in the biomedical and clinical domains, which could expand our analysis and further improve the existing methods.</p>
        <p>Another limitation is that the ClinicalSTS data set lacked user-level evaluations. The notion of relevance is context-dependent: sentence pairs with high similarity scores predicted by the models may not necessarily be considered relevant by users [<xref ref-type="bibr" rid="ref32">32</xref>]. Previous studies demonstrated that the top sentences ranked by the top STS models were not the most relevant to users based on manual judgment [<xref ref-type="bibr" rid="ref33">33</xref>]. Therefore, it is critical to conduct user-level assessments to understand whether STS models can facilitate information retrieval in practice, in addition to understanding the effectiveness and efficiency measures. We consider this as future work.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Most existing studies focus on developing innovative methods to improve correlations in the testing set. Top-ranked methods are summarized in the overview papers on clinical STS challenge tasks [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], from traditional machine learning methods [<xref ref-type="bibr" rid="ref30">30</xref>] to word and sentence embedding–based methods [<xref ref-type="bibr" rid="ref20">20</xref>] and transformer-based methods [<xref ref-type="bibr" rid="ref24">24</xref>]. Other studies further used advanced learning methods, such as representation fusion [<xref ref-type="bibr" rid="ref34">34</xref>] and multitask learning [<xref ref-type="bibr" rid="ref27">27</xref>]. The reported Pearson correlations range from 0.83 to 0.90, which is consistent with our study. Although it is exciting to further improve the state-of-the-art results, it is more critical to understand the effectiveness and efficiency of these models in depth, especially when the human-level correlation level is only moderate in these data sets.</p>
        <p>Only 2 studies have compared the effectiveness of STS models in the biomedical and clinical domains [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Tawfik et al [<xref ref-type="bibr" rid="ref35">35</xref>] compared the performance of a range of embeddings in sentence-based data sets (mostly classification-based applications, not STS) in the biomedical domain. Studies have shown that embeddings pretrained in biomedical and clinical corpora could achieve reasonable Pearson correlation scores, which is consistent with our study. However, these studies focused mainly on the Pearson correlations and did not consider model robustness or efficiency. Arguably, the latter is more critical to using STS models in practice.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this postchallenge study, we comparatively analyzed the effectiveness and efficiency of 5 deep learning models in the ClinicalSTS data set. Although these models achieved high Pearson correlation scores, their robustness varied dramatically in terms of sentence pairs at different similarity levels, and BERT models had significantly longer inference times. In addition, the models achieved Pearson correlations of approximately 0.90 in this data set, whereas the human-level agreement was only moderate. Taken together, these observations make us cautious about the further improvement of this data set and argue for a more thorough evaluation of the model-generalization capability and user-level testing. We also call for community efforts to create more STS data sets from different perspectives to reflect the multifaceted notion of sentence relatedness, which will further improve the generalization performance of deep learning models.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CNN</term>
          <def>
            <p>Convolutional Neural Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GPU</term>
          <def>
            <p>graphics processing unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">n2c2</term>
          <def>
            <p>National Natural Language Processing Clinical Challenges</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">OHNLP</term>
          <def>
            <p>Open Health Natural Language Processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">STS</term>
          <def>
            <p>semantic textual similarity</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was supported by the Intramural Research Program of the National Institutes of Health, National Library of Medicine. This work was also supported by the National Library of Medicine of the National Institutes of Health under award 4R00LM013001. The authors thank Dr. Alexis Allot for helpful discussions on the sentence search pipeline in LitSense. They also thank the National Natural Language Processing Clinical Challenges/Open Health Natural Language Processing Consortium challenge task organizers for coordinating this shared task.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>All authors are employees of the National Institutes of Health.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prakoso</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Abdi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Amrit</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Short text similarity measurement methods: a review</article-title>
          <source>Soft Comput</source>
          <year>2021</year>
          <month>01</month>
          <day>03</day>
          <volume>25</volume>
          <issue>6</issue>
          <fpage>4699</fpage>
          <lpage>723</lpage>
          <pub-id pub-id-type="doi">10.1007/s00500-020-05479-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Allot</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alvarez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Litsense: Making sense of biomedical literature at sentence level</article-title>
          <source>Nucleic Acids Res</source>
          <year>2019</year>
          <month>07</month>
          <day>02</day>
          <volume>47</volume>
          <issue>W1</issue>
          <fpage>594</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31020319"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkz289</pub-id>
          <pub-id pub-id-type="medline">31020319</pub-id>
          <pub-id pub-id-type="pii">5479473</pub-id>
          <pub-id pub-id-type="pmcid">PMC6602490</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>International Society for Biocuration</collab>
          </person-group>
          <article-title>Biocuration: Distilling data into knowledge</article-title>
          <source>PLoS Biol</source>
          <year>2018</year>
          <month>04</month>
          <day>16</day>
          <volume>16</volume>
          <issue>4</issue>
          <fpage>8</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pbio.2002846"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pbio.2002846</pub-id>
          <pub-id pub-id-type="medline">29659566</pub-id>
          <pub-id pub-id-type="pii">pbio.2002846</pub-id>
          <pub-id pub-id-type="pmcid">PMC5919672</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Allot</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in action: Addressing the covid-19 pandemic with natural language processing</article-title>
          <source>Annu Rev Biomed Data Sci</source>
          <year>2021</year>
          <month>05</month>
          <day>14</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>313</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.1146/annurev-biodatasci-021821-061045</pub-id>
          <pub-id pub-id-type="medline">34465169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Semeval-2017 Task 1: Semantic textual similarity multilingual and crosslingual focused evaluation</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)</source>
          <year>2017</year>
          <conf-name>11th International Workshop on Semantic Evaluation (SemEval-2017)</conf-name>
          <conf-date>August, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sogancioglu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Öztürk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Özgür</surname>
              <given-names>AB</given-names>
            </name>
          </person-group>
          <article-title>Biosses: A semantic sentence similarity estimation system for the biomedical domain</article-title>
          <source>Bioinform</source>
          <year>2017</year>
          <month>07</month>
          <day>15</day>
          <volume>33</volume>
          <issue>14</issue>
          <fpage>49</fpage>
          <lpage>58</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28881973"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btx238</pub-id>
          <pub-id pub-id-type="medline">28881973</pub-id>
          <pub-id pub-id-type="pii">3953954</pub-id>
          <pub-id pub-id-type="pmcid">PMC5870675</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Medsts: A resource for clinical semantic textual similarity</article-title>
          <source>Lang Resour Eval</source>
          <year>2018</year>
          <month>10</month>
          <day>24</day>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>57</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-018-9431-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Overview of the BioCreative/OHNLP challenge 2018 task 2: Clinical semantic textual similarity</article-title>
          <source>Proceedings of the 2018 ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</source>
          <year>2018</year>
          <conf-name>BCB '18: 9th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics</conf-name>
          <conf-date>August 29 - September 1, 2018</conf-date>
          <conf-loc>Washington DC USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/327424883_Overview_of_BioCreativeOHNLP_Challenge_2018_Task_2_Clinical_Semantic_Textual_Similarity"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The 2019 n2c2/ohnlp track on clinical semantic textual similarity: overview</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>11</month>
          <day>27</day>
          <volume>8</volume>
          <issue>11</issue>
          <fpage>11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/11/e23375/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/23375</pub-id>
          <pub-id pub-id-type="medline">33245291</pub-id>
          <pub-id pub-id-type="pii">v8i11e23375</pub-id>
          <pub-id pub-id-type="pmcid">PMC7732706</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schober</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Boer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarte</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Correlation coefficients</article-title>
          <source>Anesth Analg</source>
          <year>2018</year>
          <volume>126</volume>
          <issue>5</issue>
          <fpage>1763</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1213/ane.0000000000002864</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>arXiv.org</source>
          <year>2013</year>
          <access-date>2021-09-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1310.4546">https://arxiv.org/abs/1310.4546</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Glove: Global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <conf-name>2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching word vectors with subword information</article-title>
          <source>Trans Assoc Comput Linguist</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>46</lpage>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Biowordvec, improving biomedical word embeddings with subword information and mesh</article-title>
          <source>Sci Data</source>
          <year>2019</year>
          <month>05</month>
          <day>10</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>52</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-019-0055-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="medline">31076572</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6510737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kingsbury</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A comparison of word embeddings for the biomedical natural language processing</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>11</month>
          <volume>87</volume>
          <fpage>12</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30182-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.008</pub-id>
          <pub-id pub-id-type="medline">30217670</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30182-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6585427</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Bioconceptvec: Creating and evaluating literature-based biomedical concept embeddings on a large scale</article-title>
          <source>PLoS Comput Biol</source>
          <year>2020</year>
          <month>04</month>
          <day>23</day>
          <volume>16</volume>
          <issue>4</issue>
          <fpage>18</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pcbi.1007617"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1007617</pub-id>
          <pub-id pub-id-type="medline">32324731</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-19-00927</pub-id>
          <pub-id pub-id-type="pmcid">PMC7237030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of sentences and documents</article-title>
          <source>Proceedings of the 31st International Conference on Machine Learning</source>
          <year>2014</year>
          <conf-name>31st International Conference on Machine Learning</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Bejing, China</conf-loc>
          <fpage>1188</fpage>
          <lpage>96</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.mlr.press/v32/le14.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hill</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Korhonen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Learning distributed representations of sentences from unlabelled data</article-title>
          <source>Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2016</year>
          <conf-name>2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June, 2016</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <fpage>1367</fpage>
          <lpage>77</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/n16-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pagliardini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jaggi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised learning of sentence embeddings using compositional n-gram features</article-title>
          <source>arXiv.org</source>
          <year>2017</year>
          <month>03</month>
          <day>07</day>
          <access-date>2021-09-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1703.02507">https://arxiv.org/abs/1703.02507</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Deep learning with sentence embeddings pre-trained on biomedical corpora improves the performance of finding similar sentences in electronic medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>04</month>
          <day>30</day>
          <volume>20</volume>
          <issue>Suppl 1</issue>
          <fpage>73</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1044-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-1044-0</pub-id>
          <pub-id pub-id-type="medline">32349758</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-1044-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC7191680</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>Advances in Neural Information Processing Systems (NIPS)</source>
          <year>2017</year>
          <access-date>2021-09-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aminer.org/pub/5c8b4d794895d9cbc67f0e33/attention-is-all-you-need">https://www.aminer.org/pub/5c8b4d794895d9cbc67f0e33/attention-is-all-you-need</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2019</year>
          <conf-name>17th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2019)</conf-name>
          <conf-date>Jun 2 - 7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <fpage>4171</fpage>
          <lpage>86</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Improving language understanding by generative pre-training</article-title>
          <source>OpenAI</source>
          <year>2018</year>
          <month>06</month>
          <day>11</day>
          <access-date>2018-06-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/language-unsupervised/">https://openai.com/blog/language-unsupervised/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: A pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning in biomedical natural language processing: An evaluation of BERT and ELMo on ten benchmarking datasets</article-title>
          <source>Proceedings of the 18th BioNLP Workshop and Shared Task</source>
          <year>2019</year>
          <conf-name>18th BioNLP Workshop and Shared Task</conf-name>
          <conf-date>August, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>58</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-5006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>arXiv.org</source>
          <year>2019</year>
          <access-date>2021-09-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.03323">https://arxiv.org/abs/1904.03323</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>An empirical study of multi-task learning on BERT for biomedical text mining</article-title>
          <source>Proceedings of the 19th SIGBioMed Workshop on Biomedical Language Processing</source>
          <year>2020</year>
          <month>05</month>
          <day>06</day>
          <conf-name>19th SIGBioMed Workshop on Biomedical Language Processing</conf-name>
          <conf-date>July, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <fpage>205</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2020.bionlp-1.22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>HCTI at SemEval-2017 Task 1: Use convolutional neural network to evaluate semantic textual similarity</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)</source>
          <year>2017</year>
          <month>08</month>
          <conf-name>11th International Workshop on Semantic Evaluation (SemEval-2017)</conf-name>
          <conf-date>August, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>130</fpage>
          <lpage>3</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2016</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>BioSentVec: Creating sentence embeddings for biomedical texts</article-title>
          <source>Proceedings of the IEEE International Conference on Healthcare Informatics (ICHI)</source>
          <year>2019</year>
          <conf-name>The Seventh IEEE International Conference on Healthcare Informatics (ICHI 2019)</conf-name>
          <conf-date>June 10-13, 2019</conf-date>
          <conf-loc>Xi'an, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/ichi.2019.8904728</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Combining rich features and deep learning for finding similar sentences in electronic medical records</article-title>
          <source>Proceedings of the BioCreative/OHNLP Challenge</source>
          <year>2018</year>
          <conf-name>BioCreative/OHNLP Challenge</conf-name>
          <conf-date>August 29 -  September 1, 2018</conf-date>
          <conf-loc>Washington DC, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/327402060_Combining_rich_features_and_deep_learning_for_finding_similar_sentences_in_electronic_medical_records"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Vechtomova</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distilling task-specific knowledge from BERT into simple neural networks</article-title>
          <source>arXiv.org</source>
          <year>2019</year>
          <access-date>2021-09-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1903.12136">https://arxiv.org/abs/1903.12136</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saracevic</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The notion of relevance in information science: everybody knows what relevance is. But, what is it really?</article-title>
          <source>Synthesis Lectures on Information Concepts, Retrieval, and Services</source>
          <year>2016</year>
          <month>09</month>
          <day>06</day>
          <publisher-loc>Williston, United States</publisher-loc>
          <publisher-name>Morgan &#38; Claypool</publisher-name>
          <fpage>i</fpage>
          <lpage>109</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Sentence similarity measures revisited: Ranking sentences in PubMed documents</article-title>
          <source>Proceedings of the 2018 ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</source>
          <year>2018</year>
          <conf-name>BCB '18: 9th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics</conf-name>
          <conf-date>August 29 - September 1, 2018</conf-date>
          <conf-loc>Washington DC, United States</conf-loc>
          <publisher-name>Association for Computing Machinery</publisher-name>
          <fpage>531</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1145/3233547.3233640</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Distributed representation and one-hot representation fusion with gated network for clinical semantic textual similarity</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>04</month>
          <day>30</day>
          <volume>20</volume>
          <issue>Suppl 1</issue>
          <fpage>72</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1045-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-1045-z</pub-id>
          <pub-id pub-id-type="medline">32349764</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-1045-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC7191689</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tawfik</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Spruit</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Evaluating sentence representations for biomedical text: methods and experimental results</article-title>
          <source>J Biomed Inform</source>
          <year>2020</year>
          <month>04</month>
          <volume>104</volume>
          <fpage>103396</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(20)30025-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103396</pub-id>
          <pub-id pub-id-type="medline">32147441</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30025-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Antunes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Silva</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Evaluating semantic textual similarity in clinical sentences using deep learning and sentence embeddings</article-title>
          <source>Proceedings of the 35th Annual ACM Symposium on Applied Computing</source>
          <year>2020</year>
          <conf-name>SAC '20: Proceedings of the 35th Annual ACM Symposium on Applied Computing</conf-name>
          <conf-date>March 30 -  April 3, 2020</conf-date>
          <conf-loc>Brno, Czech Republic</conf-loc>
          <fpage>662</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1145/3341105.3373987</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
