<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e23086</article-id>
      <article-id pub-id-type="pmid">33480858</article-id>
      <article-id pub-id-type="doi">10.2196/23086</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>ALBERT-Based Self-Ensemble Model With Semisupervised Learning and Data Augmentation for Clinical Semantic Textual Similarity Calculation: Algorithm Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liu</surname>
            <given-names>Sijia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wang</surname>
            <given-names>Liwei</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mordaunt</surname>
            <given-names>Dylan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Junyi</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7162-5396</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Xuejie</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5252-5162</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>Xiaobing</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Information Science and Engineering</institution>
            <institution>Yunnan University</institution>
            <addr-line>East Outer Ring Road</addr-line>
            <addr-line>Chenggong District, Kunming</addr-line>
            <addr-line>Kunming, 650091</addr-line>
            <country>China</country>
            <phone>86 87165031748</phone>
            <email>zhouxb@ynu.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1983-0971</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Information Science and Engineering</institution>
        <institution>Yunnan University</institution>
        <addr-line>Kunming</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Xiaobing Zhou <email>zhouxb@ynu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>1</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>1</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>1</issue>
      <elocation-id>e23086</elocation-id>
      <history>
        <date date-type="received">
          <day>31</day>
          <month>7</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>22</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>12</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Junyi Li, Xuejie Zhang, Xiaobing Zhou. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 22.01.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2021/1/e23086/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>In recent years, with increases in the amount of information available and the importance of information screening, increased attention has been paid to the calculation of textual semantic similarity. In the field of medicine, electronic medical records and medical research documents have become important data resources for clinical research. Medical textual semantic similarity calculation has become an urgent problem to be solved.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This research aims to solve 2 problems—(1) when the size of medical data sets is small, leading to insufficient learning and understanding by the models and (2) when information is lost in the process of long-distance propagation, causing the models to be unable to grasp key information.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This paper combines a text data augmentation method and a self-ensemble ALBERT model under semisupervised learning to perform clinical textual semantic similarity calculations.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Compared with the methods in the 2019 National Natural Language Processing Clinical Challenges Open Health Natural Language Processing shared task Track on Clinical Semantic Textual Similarity, our method surpasses the best result by 2 percentage points and achieves a Pearson correlation coefficient of 0.92.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>When the size of medical data set is small, data augmentation can increase the size of the data set and improved semisupervised learning can boost the learning efficiency of the model. Additionally, self-ensemble methods improve the model performance. Our method had excellent performance and has great potential to improve related medical problems.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>data augmentation</kwd>
        <kwd>semisupervised</kwd>
        <kwd>self-ensemble</kwd>
        <kwd>ALBERT</kwd>
        <kwd>clinical semantic textual similarity</kwd>
        <kwd>algorithm</kwd>
        <kwd>semantic</kwd>
        <kwd>model</kwd>
        <kwd>data sets</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>With the rapid development of computers and artificial intelligence, information availability has begun to show exponential growth. We are already in an era of information explosion. When faced with a large amount of information, time is wasted screening valid information. In addition, a large amount of information is stored in the form of text. Whether involving cluster storage or referring to related information, efficient information matching and screening is crucial. The importance of text information processing research has become very obvious. With major breakthroughs in the research of related algorithms in natural language processing and artificial intelligence, increasingly, research has been devoted to text information processing.</p>
      <p>Textual similarity calculation [<xref ref-type="bibr" rid="ref1">1</xref>] is a key technology for efficient information screening and matching in the field of text processing. Previous work [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref8">8</xref>] has proposed some methods for textual similarity calculation, for example, traditional text similarity calculation methods [<xref ref-type="bibr" rid="ref2">2</xref>], word similarity calculation [<xref ref-type="bibr" rid="ref3">3</xref>], vector space model [<xref ref-type="bibr" rid="ref4">4</xref>], and latent Dirichlet allocation model [<xref ref-type="bibr" rid="ref5">5</xref>]. At present, with the development of deep learning and neural networks, methods based on neural networks have become popular, for example, word vector embedding method [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>] and one-hot representation [<xref ref-type="bibr" rid="ref8">8</xref>]. At the same time, these methods can also be clinically applied.</p>
      <p>In the field of medicine, with the rapid increase in electronic medical data [<xref ref-type="bibr" rid="ref9">9</xref>], electronic medical records and medical documents have become important data resources for medical clinical research. However, most of these data resources are stored unprocessed or in heterogeneous text formats. To understand the content of text data, it is necessary to integrate structured and heterogeneous clinical data resources, medical records, and scientific research documents. Similarity calculation can improve information retrieval performance for medical resources and effectively allow the integration of heterogeneous clinical data. The concept of semantic similarity evaluation is the key to understanding text data resources, which can effectively allow the processing, classification, and structured processing of those resources. For example, a semantic similarity method can be used to semantically analyze patient medical records to identify similar cases and find the best solution.</p>
      <p>However, a large number of publicly available medical data sets are restricted because of privacy, and there are insufficient sources of medical data sets. The scarcity of data sets has led to the slow development of natural language processing (NLP) in the medical field. In recent years, more researchers have begun to pay attention to this issue. Therefore, competitions related to textual semantic similarity calculation have been produced, such as SemEval [<xref ref-type="bibr" rid="ref10">10</xref>], to develop an automated method, and the 2019 National NLP Clinical Challenges (N2C2) Open Health Natural Language Processing (OHNLP) [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>] shared task Track 1 on Clinical Semantic Textual Similarity (STS) [<xref ref-type="bibr" rid="ref13">13</xref>], for systems based on semisupervised learning. An example of clinical STS is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. The score indicates the similarity between the 2 sentences and falls within an ordinal range from 0 to 5, where 0 means that the 2 sentences are completely different (ie, their meanings do not overlap) and 5 means that the 2 sentences have complete semantic equivalence.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>An example from the Clinical STS.</p>
        </caption>
        <graphic xlink:href="medinform_v9i1e23086_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>Teams that participated in the 2019 N2C2 OHNLP Clinical STS challenge demonstrated good results with methods such as multitask learning, XLNet, and ClinicalBERT methods. In the challenge, we used recursive neural networks and variants of these neural networks for experiments, such as long short-term memory neural networks [<xref ref-type="bibr" rid="ref14">14</xref>], convolutional neural networks [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>], capsule neural networks [<xref ref-type="bibr" rid="ref17">17</xref>], and ordered long short-term memory neural networks. In addition, we combined some popular deep learning mechanisms, such as attention [<xref ref-type="bibr" rid="ref18">18</xref>] and Siamese [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>] networks. Through comparative experimental research, we obtained a Pearson correlation coefficient of 0.66 [<xref ref-type="bibr" rid="ref21">21</xref>] in the official submission, which was not a satisfying result. Compared with other teams’ methods, our model had 2 drawbacks. First, because the size of clinical data sets was small, there were not enough data to train the model, which led to insufficient learning and understanding of the model. Second, our model was based on a recurrent neural network. Due to the influence of the forget gate in the recurrent neural network, important information may be lost in the process of long-distance propagation, which prevents the model from extracting key information. As a result, the learning efficiency of the model decreased.</p>
      <p>To address the abovementioned problems, this paper proposes a self-ensemble [<xref ref-type="bibr" rid="ref22">22</xref>] ALBERT [<xref ref-type="bibr" rid="ref23">23</xref>] model under semisupervised learning [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] with easy data augmentation (EDA) [<xref ref-type="bibr" rid="ref26">26</xref>] to calculate the semantic similarity of clinical text.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we introduce 3 highlights of our method. Our method uses data augmentation and semisupervised learning to expand the scale of the data set from different levels. We pretrained ALBERT (based on self-ensemble methods) to strengthen the acquisition of key information and improve the performance of the model, and semisupervised learning and data augmentation methods were used to expand the number of data sets and increase the representation of data sets, which can prevent self-ensemble methods from overfitting.</p>
      </sec>
      <sec>
        <title>Data Augmentation</title>
        <p>By using external general domain data sets for semisupervised learning, we indirectly solved the problem of insufficient data. However, for medical data, semisupervised learning does not directly increase the amount of medical data. Therefore, we used an EDA method to directly increase the amount of medical data.</p>
        <p>Generally, data augmentation is used in computer vision to flip, zoom, and add noise to a picture. These operations can increase small amounts of data, which can help train a more robust model; however, for text data, data augmentation is mainly used for operations such as replacing, adding, and deleting text. Previous work [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>] has proposed some methods for data augmentation in NLP. For example, a study [<xref ref-type="bibr" rid="ref27">27</xref>] translated sentences into French and then into English to generate new data. Other work has used data noising as smoothing [<xref ref-type="bibr" rid="ref28">28</xref>]. However, these methods are highly time- and resource-consuming and thus are not often used in practice.</p>
        <p>In this paper, we use the form of EDA [<xref ref-type="bibr" rid="ref26">26</xref>] shown in <xref ref-type="table" rid="table1">Table 1</xref>. Due to the irreplaceability of proper nouns in medical data, the selection range of the replacement operation has been optimized to keep proper nouns as much as possible. The size of the medical data set increased from 1642 to 16,411 after EDA. We can intuitively see a substantial increase in the amount of medical data. We verified that this method increases the size of the data set.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Sentences generated using EDA.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Operation</td>
                <td>Sentence 1</td>
                <td>Sentence 2</td>
                <td>Sentence 3</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>None<sup>a</sup></td>
                <td>oxycodone [ROXICODONE] 5 mg tablet 0.5-1 tablets by mouth every 4 hours as needed.</td>
                <td>A lady is running her cute dog through an agility course.</td>
                <td>A beautiful woman with a young girl pose with bear statues in front of a store.</td>
              </tr>
              <tr valign="top">
                <td>Synonym replacement</td>
                <td>oxycodone [ROXICODONE] 5 mg tablet 0.5-1 tablets by mouth every 4 hours as indeed.</td>
                <td>A lady is running her cute dog through an legerity course.</td>
                <td>A beautiful woman with a young girl pose with bear figurines in front of a store.</td>
              </tr>
              <tr valign="top">
                <td>Random insertion</td>
                <td>oxycodone [ROXICODONE] 5 mg tablet 0.5-1 tablets by every mouth every 4 hours as needed.</td>
                <td>A lady is running her cute dog through an amazing agility course.</td>
                <td>A beautiful woman with a young girl pose with lovely bear statues in front of a store.</td>
              </tr>
              <tr valign="top">
                <td>Random deletion</td>
                <td>oxycodone [ROXICODONE] 5 mg tablet 0.5-1 tablets by mouth every 4 hours.</td>
                <td>A lady is running her dog through an agility course.</td>
                <td>A woman with a young girl pose with bear statues in front of a store.</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>None indicates that this sentence did not undergo any operation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Semisupervised Learning</title>
        <p>Because there was not a sufficient amount of medical data, the training of the model was not complete. To solve this problem, we used the semisupervised learning method in transfer learning.</p>
        <p>The semisupervised [<xref ref-type="bibr" rid="ref29">29</xref>] pretraining task in NLP is a form of transfer learning that aims to establish a wide range of semantic understanding to promote the performance improvement of training and testing tasks. It has been proven that semisupervised pretraining in transfer learning is very effective in benchmark NLP tasks, and the application prospects in medical NLP tasks are particularly broad. Nonspecific pretraining tasks are used for general medical domain tasks; however, commonly used and publicly available data sets are not specific to the medical domain and may not be well summarized. Therefore, the transfer of nonspecific pretraining tasks and the promotion of language models to medical domain tasks are very important for future model development.</p>
        <p>To improve traditional semisupervised learning, we used the <italic>teacher</italic> and <italic>student</italic> idea in data distillation [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>] to improve the design of semisupervised learning. Teacher–student refers to the same training process. The beginning of the student's training is the end of the teacher's training, which can deepen the learning of the model. We used the teacher–student approach to design semisupervised learning. The teacher part uses a data set from the common domain, using the STS-B data set from the General Language Understanding Evaluation standard of the general domain. The student part uses a clinical text data set. Our semisupervised learning method is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Semisupervised learning.</p>
          </caption>
          <graphic xlink:href="medinform_v9i1e23086_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Self-Ensemble ALBERT Model</title>
        <p>ALBERT has been applied to some tasks, such as natural language inference [<xref ref-type="bibr" rid="ref32">32</xref>], sentiment analysis [<xref ref-type="bibr" rid="ref33">33</xref>], causality analysis [<xref ref-type="bibr" rid="ref34">34</xref>], and medical machine reading [<xref ref-type="bibr" rid="ref35">35</xref>]. The self-attention structure is the core part of the transformer mechanism. The self-attention structure can directly calculate the similarity between words, which can intuitively solve the problem of long-distance information dependence. The combined self-attention structure transformer's semantic feature extraction ability is better than those of long short-term memory and convolutional neural networks, and it performs better under the combined action of decomposed embedding parameters and cross-layer shared parameters. Therefore, the pretrained self-attention structure, namely, the pretrained ALBERT model, was applied to our model. ALBERT is a variant of BERT that adds 2 methods of decomposing embedded parameters and sharing parameters across layers. It has 3 improvements. First, ALBERT decomposes embedding, which makes a large number of parameters sparse and reduces the number of dictionaries. Second, ALBERT adopts cross-layer parameter sharing, which reduces the parameter scale and improves the training speed. Third, ALBERT uses intersentence coherence, which makes the model unaffected by specific tasks. The architecture of the ALBERT model is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Model architecture.</p>
          </caption>
          <graphic xlink:href="medinform_v9i1e23086_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Following ALBERT, we first embedded the input data. Our embedding representation is constructed by the sum of token embedding, segment embedding, and location embedding. The input sequence is <italic>S</italic> = [<italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub>, ..., <italic>s</italic><sub>n</sub>], where <italic>n</italic> is the number of words in the input. The tokens “[CLS]” and “[SEP]” were added at the beginning and end of each instance, respectively.</p>
        <p>Then, we input the data into the ALBERT model, which is made up of <italic>n</italic> transformer stacks,</p>
        <p>
          <graphic xlink:href="medinform_v9i1e23086_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>S</italic><sub>m</sub> is the output of transformer stack <italic>m</italic>.</p>
        <p>Since the results do not need to be normalized, we did not use an activation function.</p>
        <p>To achieve the best performance, the ALBERT model was fine-tuned. ALBERT models are usually fine-tuned using stochastic gradient descent methods. In fact, fine-tuning the performance of ALBERT is usually sensitive to different random seeds and orders of the training data, especially if the last training sample is noisy. To alleviate this situation, an ensemble method was used to combine multiple fine-tuning models because it can reduce overfitting and improve model generalization. The ensemble ALBERT model usually has better performance than a single ALBERT model. However, training multiple ALBERT models simultaneously is time-consuming. It is often impossible to train multiple models with limited time and GPU resources. Therefore, we improved the model ensemble method to fine-tune the ALBERT model. Our model’s ensemble method is called self-ensemble. The self-ensemble architecture is shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>. The formula for self-ensemble is</p>
        <p>
          <graphic xlink:href="medinform_v9i1e23086_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where ALBERT(<italic>S</italic><sub>k</sub>) represents the checkpoints of the model with <italic>k</italic> training steps.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>(a) Traditional ensemble vs (b) self-ensemble architecture.</p>
          </caption>
          <graphic xlink:href="medinform_v9i1e23086_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Sets</title>
        <p>The Clinical STS shared task data set was collected from electronic health records in the Mayo Clinic clinical data warehouse. Since the Mayo Clinic has completed the system-wide electronic health record conversion of all care locations from General Electric to Epic, the Clinical STS shared task data set was extracted from the historical General Electric and Epic systems.</p>
        <p>STS-B is a carefully selected English data set used in the SemEval and *SEM STS shared tasks between 2012 and 2017. The data were divided into a training set, a development set, and a test set. The development set can be used to design new models and adjust hyperparameters. STS-B can be used to make comparable assessments in different research work and improve the tracking of the latest technology.</p>
        <p><xref ref-type="table" rid="table2">Table 2</xref> shows the sizes of the Clinical STS data set and the STS-B data set. The STS-B data set was used for the semisupervised learning training model. The STS-B data set comes from a data set collected by the general domain criterion General Language Understanding Evaluation. The Clinical STS data set was used to test the experimental results. The Clinical STS data set was provided by the competition organizer.</p>
        <p>The STS-B data set provides paired text summaries, which are mainly from STS tasks in SemEval obtained over the years. The Clinical STS data set provides pairs of clinical text summaries, which are sentences extracted from clinical notes. This task assigns a numerical score to each pair of sentences to indicate their semantic similarity. <xref ref-type="table" rid="table3">Table 3</xref> shows that the scores fall within an ordinal range from 0 to 5, where 0 means that the pair of sentences are completely different (ie, their meanings do not overlap) and 5 means that the pair of sentences have complete semantic equivalence.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The size of data set.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Data set</td>
                <td>Training</td>
                <td>Validation</td>
                <td>Test</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>STS-B</td>
                <td>5749</td>
                <td>1500</td>
                <td>1379</td>
              </tr>
              <tr valign="bottom">
                <td>Clinical STS</td>
                <td>1642</td>
                <td>N/A<sup>a</sup></td>
                <td>412</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Similarity scores with examples.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="400"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Score</td>
                <td>Sentence 1</td>
                <td>Sentence 2</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0</td>
                <td>The patient has missed 0 hours of work in the past seven days for issues not related to depression.</td>
                <td>In the past year, the patient has the following number of visits: none in the hospital none in the er and one as an outpatient.</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>nortriptyline [PAMELOR] 50 mg capsule 1 capsule by mouth every bedtime.</td>
                <td>Tylenol Extra Strength 500 mg tablet 2 tablets by mouth every bedtime.</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>bupropion [WELLBUTRIN XL] 300 mg tablet sustained release 24 hour 1 tablet by mouth one time daily.</td>
                <td>Flintstones Complete chewable tablet 1 tablet by mouth two times a day.</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Given current medication regimen, the following parameters should be monitored by outpatient providers: None</td>
                <td>Given current medication regimen, the following parameters should be monitored by outpatient providers: lithium level</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>The diagnosis and treatment plan were explained to the family/caregiver who expressed understanding of the information presented.</td>
                <td>Explained diagnosis and treatment plan; patient expressed adequate understanding of the information presented today.</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>Learns best by: verbal instructions as procedure is being performed, reading, seeing, listening.</td>
                <td>Learns best by: verbal instruction while procedure is performed, reading, seeing, listening.</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Metric</title>
        <p>We used the Pearson correlation coefficient as an evaluation criterion for the performance of the task. The Pearson correlation coefficient,</p>
        <p>
          <graphic xlink:href="medinform_v9i1e23086_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>E</italic> is the mathematical expectation (or mean), <italic>D</italic> is the variance, and Cov(<italic>X</italic>,<italic>Y</italic>)=E{ [X – E(X)] [Y – E(Y)]} is the covariance of random variables <italic>X</italic> and <italic>Y</italic>, is used to measure the degree of correlation between 2 variables.</p>
      </sec>
      <sec>
        <title>Experimental Setting</title>
        <p>In the experiments, we used an Intel Xeon 2.2 GHz processor and an Nvidia Tesla V100 32 GB GPU. Since we use semisupervised learning and self-ensemble techniques, our model will be stored by the checkpoint. The input dimensions of each of our data sets are the same. The optimal setting for the length of the input sequence was 64, and the optimal setting for the batch size was 32. The optimal setting for the checkpoint was 200. The optimal setting of the training step was 3598. In the experiments, we did not cross-train on the data set.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance Comparison</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> shows the top 5 performance results for the 2019 N2C2 OHNLP Track 1 Clinical STS, the value that we obtained during the challenge, and the value obtained by the method presented in this paper. Our current method achieves a good result—the Pearson correlation coefficient value exceeded the best result by 2 percentage points.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Results on the test set for Clinical STS.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Methods</td>
                <td>Pearson correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Multitask learning, ClinicalBERT</td>
                <td>0.90</td>
              </tr>
              <tr valign="top">
                <td>Multitask learning, BERT</td>
                <td>0.89</td>
              </tr>
              <tr valign="top">
                <td>BERT, XLNet</td>
                <td>0.88</td>
              </tr>
              <tr valign="top">
                <td>BERT</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>BERT, XLNet</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>Our previous method<sup>a</sup></td>
                <td>0.66</td>
              </tr>
              <tr valign="top">
                <td>Our method in this paper</td>
                <td>0.92</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Ordered long short-term memory and attention.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Augmentation</title>
        <p>The EDA method uses text replacement and deletion operations, optimizes the selection range of replacement and deletion, and retains the medical proper nouns in the data set. <xref ref-type="table" rid="table5">Table 5</xref> shows the effect of using EDA on the model performance. After EDA, the size of the medical data set was expanded, and the model's performance was greatly improved.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Comparison between the model with and without EDA.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Methods</td>
                <td>Pearson correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Without EDA<sup>a</sup></td>
                <td>0.88</td>
              </tr>
              <tr valign="top">
                <td>With EDA</td>
                <td>0.92</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>EDA: easy data augmentation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Semisupervised Learning</title>
        <p>The semisupervised learning method uses the general domain data set STS-B for training to solve the problem of insufficient medical data. <xref ref-type="table" rid="table6">Table 6</xref> shows the effect of using semisupervised learning on the model performance. We can see that semisupervised learning can greatly improve the performance of the model.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Comparison between the model with and without semisupervised learning.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Methods</td>
                <td>Pearson correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Without semisupervised learning</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>With semisupervised learning</td>
                <td>0.92</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Self-Ensemble ALBERT</title>
        <p><xref ref-type="table" rid="table7">Table 7</xref> shows the effect of using the self-ensemble method on the model performance. We can see that the efficiency of the model with self-ensemble is better than that of the ordinary ensemble model. Additionally, self-ensemble greatly shortens the training time of the model, reduces the calculation time of the algorithm, and improves the efficiency of the algorithm.</p>
        <p>BERT and ALBERT are pretrained models with the same self-attention structure. As shown in <xref ref-type="table" rid="table8">Table 8</xref>, the performance of ALBERT is better than that of BERT on the Clinical STS data set.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Comparison among the model without ensemble, the model with ensemble, and the model with self-ensemble.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Pearson correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>None</td>
                <td>0.85</td>
              </tr>
              <tr valign="top">
                <td>Ensemble<sup>a</sup></td>
                <td>0.89</td>
              </tr>
              <tr valign="top">
                <td>Self-ensemble</td>
                <td>0.92</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>Ensemble represents an ensemble method through multiple ALBERT models.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Comparison between the ALBERT and BERT models.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <thead>
              <tr valign="top">
                <td>Methods</td>
                <td>Runtime (minutes)</td>
                <td>Convergence speed<sup>a</sup> (steps)</td>
                <td>Pearson correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>BERT</td>
                <td>50</td>
                <td>3300</td>
                <td>0.86</td>
              </tr>
              <tr valign="top">
                <td>ALBERT</td>
                <td>32</td>
                <td>2700</td>
                <td>0.92</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>Convergence speed is measured using the training steps.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Overview</title>
        <p>This paper makes the following contributions. First, we used the EDA text data augmentation method. This method increased the amount of data through a series of operations and enriched the semantics of the data. Second, for the problem of insufficient medical data, we used a semisupervised learning method. This method relied on the use of external data to enrich the semantics. Third, to solve the problem of learning complex semantics and the loss of key semantic information, we used the self-ensemble ALBERT model for semantic similarity calculation of clinical text. This method not only improves the results of the semantic similarity calculation of clinical text but also, due to the improvement of the self-ensemble of our model, allows the algorithm to shorten its running time and improve its efficiency. With these techniques, our model obtained a Pearson correlation coefficient of 0.92.</p>
        <p>In order to test the influence of the method on performance, we conducted ablation experiments on EDA, semisupervised learning, and self-ensemble. At the same time, in order to verify the performance of the model, we also performed ablation experiments on ALBERT.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Compared with other models and methods, combining an EDA and self-ensemble ALBERT model under semisupervised learning to perform clinical textual semantic similarity calculations can save a large amount of training time and allows more data to be trained at the same time. This brings great convenience for practical applications and scientific research.</p>
        <p>In the future, we will study how to combine reinforcement learning to process natural language to further improve the performance of the model and handle the dilemma of bloated or erroneous content in electronic health records caused by the increasing use of copy and paste.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">EDA</term>
          <def>
            <p>easy data augmentation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GLUE</term>
          <def>
            <p>General Language Understanding Evaluation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">OHNLP</term>
          <def>
            <p>Open Health Natural Language Processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">N2C2</term>
          <def>
            <p>National NLP Clinical Challenges</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">STS</term>
          <def>
            <p>semantic textual similarity</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the National Natural Science Foundation of China under Grant 61463050, Grant 61762091 and Grant 12061088, and the Science Foundation of Yunnan Education Department under Grant 2020Y0011.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karwatowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Russek</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wielgosz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Koryciak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wiatr</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Energy efficient calculations of text similarity measure on FPGA-accelerated computing platforms</article-title>
          <source>Parallel Processing and Applied Mathematics</source>
          <year>2016</year>
          <month>4</month>
          <day>2</day>
          <volume>9573</volume>
          <fpage>31</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-319-32149-3_4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-319-32149-3_4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wenyin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Short text similarity based on probabilistic topics</article-title>
          <source>Knowl Inf Syst</source>
          <year>2009</year>
          <month>9</month>
          <day>17</day>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>473</fpage>
          <lpage>491</lpage>
          <pub-id pub-id-type="doi">10.1007/s10115-009-0250-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Question similarity calculation for FAQ answering</article-title>
          <year>2007</year>
          <conf-name>Third International Conference on Semantics Knowledge and Grid (SKG)</conf-name>
          <conf-date>October 29-31</conf-date>
          <conf-loc>Shan Xi, China</conf-loc>
          <fpage>298</fpage>
          <lpage>301</lpage>
          <pub-id pub-id-type="doi">10.1109/skg.2007.247</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>An improved text similarity calculation algorithm based on vsm</article-title>
          <source>AMR</source>
          <year>2011</year>
          <month>4</month>
          <volume>225-226</volume>
          <fpage>1105</fpage>
          <lpage>1108</lpage>
          <pub-id pub-id-type="doi">10.4028/www.scientific.net/amr.225-226.1105</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Deep learning for remote sensing data: a technical tutorial on the state of the art</article-title>
          <source>IEEE Geosci Remote Sens Mag</source>
          <year>2016</year>
          <month>6</month>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>22</fpage>
          <lpage>40</lpage>
          <pub-id pub-id-type="doi">10.1109/mgrs.2016.2540798</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <month>10</month>
          <conf-name>19th Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 25–29</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>1543</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kusner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kolkin</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>From word embeddings to document distances</article-title>
          <year>2015</year>
          <conf-name>International Conference on Machine Learning</conf-name>
          <conf-date>July 6-11</conf-date>
          <conf-loc>Lille, France</conf-loc>
          <fpage>957</fpage>
          <lpage>966</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Distributed representation and one-hot representation fusion with gated network for clinical semantic textual similarity</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>04</month>
          <day>30</day>
          <volume>20</volume>
          <issue>Suppl 1</issue>
          <fpage>1</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1045-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-1045-z</pub-id>
          <pub-id pub-id-type="medline">32349764</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-1045-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC7191689</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ritchie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Welch</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Categorization of third-party apps in electronic health record app marketplaces: systematic search and analysis</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>05</month>
          <day>29</day>
          <volume>8</volume>
          <issue>5</issue>
          <fpage>e16980</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/5/e16980/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/16980</pub-id>
          <pub-id pub-id-type="medline">32469324</pub-id>
          <pub-id pub-id-type="pii">v8i5e16980</pub-id>
          <pub-id pub-id-type="pmcid">PMC7293052</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Gazpio</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Specia</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>SemEval-2017 Task 1: Semantic textual similarity multilingual and crosslingual focused evaluation</article-title>
          <source>Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)</source>
          <year>2017</year>
          <conf-name>11th International Workshop on Semantic Evaluation</conf-name>
          <conf-date>August 3-4</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/s17-2001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Overview of the BioCreative/OHNLP challenge 2018 task 2: clinical semantic textual similarity</article-title>
          <source>Proceedings of the 2018 ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</source>
          <year>2018</year>
          <month>8</month>
          <conf-name>9th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics</conf-name>
          <conf-date>August 29-September 1</conf-date>
          <conf-loc>Washington DC, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3233547.3233672</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The 2019 n2c2/OHNLP track on clinical semantic textual similarity: overview</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>11</month>
          <day>27</day>
          <volume>8</volume>
          <issue>11</issue>
          <fpage>e23375</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/11/e23375/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/23375</pub-id>
          <pub-id pub-id-type="medline">33245291</pub-id>
          <pub-id pub-id-type="pii">v8i11e23375</pub-id>
          <pub-id pub-id-type="pmcid">PMC7732706</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Sijia</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>Naveed</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>Majid</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Liwei</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Feichen</given-names>
            </name>
            <name name-style="western">
              <surname>Kingsbury</surname>
              <given-names>Paul</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Hongfang</given-names>
            </name>
          </person-group>
          <article-title>A comparison of word embeddings for the biomedical natural language processing</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>11</month>
          <volume>87</volume>
          <fpage>12</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30182-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.008</pub-id>
          <pub-id pub-id-type="medline">30217670</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30182-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6585427</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory neural network for traffic speed prediction using remote microwave sensor data</article-title>
          <source>Transportation Research Part C: Emerging Technologies</source>
          <year>2015</year>
          <month>05</month>
          <volume>54</volume>
          <fpage>187</fpage>
          <lpage>197</lpage>
          <pub-id pub-id-type="doi">10.1016/j.trc.2015.03.014</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nogues</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mollura</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Summers</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Deep convolutional neural networks for computer-aided detection: CNN architectures, dataset characteristics and transfer learning</article-title>
          <source>IEEE Trans Med Imaging</source>
          <year>2016</year>
          <month>05</month>
          <volume>35</volume>
          <issue>5</issue>
          <fpage>1285</fpage>
          <lpage>98</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26886976"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/TMI.2016.2528162</pub-id>
          <pub-id pub-id-type="medline">26886976</pub-id>
          <pub-id pub-id-type="pmcid">PMC4890616</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A gated dilated convolution with attention model for clinical cloze-style reading comprehension</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2020</year>
          <month>02</month>
          <day>19</day>
          <volume>17</volume>
          <issue>4</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph17041323"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph17041323</pub-id>
          <pub-id pub-id-type="medline">32092861</pub-id>
          <pub-id pub-id-type="pii">ijerph17041323</pub-id>
          <pub-id pub-id-type="pmcid">PMC7068278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A convolutional neural network based on a capsule network with strong generalization for bearing fault diagnosis</article-title>
          <source>Neurocomputing</source>
          <year>2019</year>
          <month>01</month>
          <volume>323</volume>
          <fpage>62</fpage>
          <lpage>75</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2018.09.050</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
          <conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <fpage>5998</fpage>
          <lpage>6008</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bertinetto</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Valmadre</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Henriques</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Vedaldi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Torr</surname>
              <given-names>PHS</given-names>
            </name>
          </person-group>
          <article-title>Fully-convolutional siamese networks for object tracking</article-title>
          <year>2016</year>
          <month>11</month>
          <conf-name>European Conference on Computer Vision</conf-name>
          <conf-date>October 8-16</conf-date>
          <conf-loc>Amsterdam, Netherlands</conf-loc>
          <fpage>850</fpage>
          <lpage>865</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-319-48881-3_56"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-319-48881-3_56</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Where-and-when to look: deep siamese attention networks for video-based person re-identification</article-title>
          <source>IEEE Transactions on Multimedia</source>
          <year>2019</year>
          <month>06</month>
          <volume>21</volume>
          <issue>6</issue>
          <fpage>1412</fpage>
          <lpage>1424</lpage>
          <pub-id pub-id-type="doi">10.1109/tmm.2018.2877886</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eisinga</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Grotenhuis</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Pelzer</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The reliability of a two-item scale: Pearson, Cronbach, or Spearman-Brown?</article-title>
          <source>Int J Public Health</source>
          <year>2013</year>
          <month>08</month>
          <volume>58</volume>
          <issue>4</issue>
          <fpage>637</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.1007/s00038-012-0416-3</pub-id>
          <pub-id pub-id-type="medline">23089674</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>Hwejin</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Bumsoo</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>Inyeop</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>Junhyun</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>Jaewoo</given-names>
            </name>
          </person-group>
          <article-title>Classification of lung nodules in CT scans using three-dimensional deep convolutional neural networks with a checkpoint ensemble method</article-title>
          <source>BMC Med Imaging</source>
          <year>2018</year>
          <month>12</month>
          <day>03</day>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>48</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedimaging.biomedcentral.com/articles/10.1186/s12880-018-0286-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12880-018-0286-0</pub-id>
          <pub-id pub-id-type="medline">30509191</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12880-018-0286-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6276244</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gimpel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Soricut</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ALBERT: A lite BERT for self-supervised learning of language representations</article-title>
          <year>2019</year>
          <conf-name>International Conference on Learning Representations</conf-name>
          <conf-date>April 26-30</conf-date>
          <conf-loc>Addis Ababa</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>JND</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised and unsupervised extreme learning machines</article-title>
          <source>IEEE Trans Cybern</source>
          <year>2014</year>
          <month>12</month>
          <volume>44</volume>
          <issue>12</issue>
          <fpage>2405</fpage>
          <lpage>2417</lpage>
          <pub-id pub-id-type="doi">10.1109/TCYB.2014.2307349</pub-id>
          <pub-id pub-id-type="medline">25415946</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Enguehard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>O'Halloran</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gholipour</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised learning with deep embedded clustering for image classification and segmentation</article-title>
          <source>IEEE Access</source>
          <year>2019</year>
          <volume>7</volume>
          <fpage>11093</fpage>
          <lpage>11104</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2019.2891970</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>EDA: Easy data augmentation techniques for boosting performance on text classification tasks</article-title>
          <source>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</source>
          <year>2019</year>
          <month>11</month>
          <conf-name>EMNLP-IJCNLP 2019</conf-name>
          <conf-date>November 3-7</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>6382</fpage>
          <lpage>6388</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1670</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Dohan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>MT</given-names>
            </name>
          </person-group>
          <article-title>QANet: Combining local convolution with global self-attention for reading comprehension</article-title>
          <year>2018</year>
          <conf-name>International Conference on Learning Representations</conf-name>
          <conf-date>April 30-May 3</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Data noising as smoothing in neural network language models</article-title>
          <year>2017</year>
          <conf-name>International Conference on Learning Representations</conf-name>
          <conf-date>April 24-26</conf-date>
          <conf-loc>Toulon, France</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cambria</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised learning for big social data analysis</article-title>
          <source>Neurocomputing</source>
          <year>2018</year>
          <month>01</month>
          <volume>275</volume>
          <fpage>1662</fpage>
          <lpage>1673</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2017.10.010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A gift from knowledge distillation: fast optimization, network minimization and transfer learning</article-title>
          <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2017</year>
          <month>11</month>
          <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>July 21-26</conf-date>
          <conf-loc>Honolulu, HI, USA</conf-loc>
          <fpage>4133</fpage>
          <lpage>4141</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2017.754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A novel enhanced collaborative autoencoder with knowledge distillation for top-N recommender systems</article-title>
          <source>Neurocomputing</source>
          <year>2019</year>
          <month>03</month>
          <volume>332</volume>
          <fpage>137</fpage>
          <lpage>148</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2018.12.025</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nangia</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>SR</given-names>
            </name>
          </person-group>
          <article-title>A broad-coverage challenge corpus for sentence understanding through Inference</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2018</year>
          <conf-name>2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 1-6</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <fpage>1112</fpage>
          <lpage>1122</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zampieri</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nakov</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenthal</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>SemEval-2020 task 12: Multilingual offensive language identification in social media (OffensEval 2020)</article-title>
          <source>Proceedings of the Fourteenth Workshop on Semantic Evaluation</source>
          <year>2020</year>
          <conf-name>The 28th International Conference on Computational Lingustics (COLING-2020)</conf-name>
          <conf-date>September 13-14</conf-date>
          <conf-loc>Barcelona (online)</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>1425</fpage>
          <lpage>1447</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>HQ</given-names>
            </name>
          </person-group>
          <article-title>Dynamic causality knowledge graph generation for supporting the Chatbot health care system</article-title>
          <source>Proceedings of the Future Technologies Conference (FTC) 2020</source>
          <year>2020</year>
          <conf-name>Future Technologies Conference (FTC) 2020</conf-name>
          <conf-date>October</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>30</fpage>
          <lpage>45</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-030-63092-8_3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Towards medical machine reading comprehension with structural knowledge and plain text</article-title>
          <source>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2020</year>
          <conf-name>2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>November</conf-date>
          <conf-loc>Online</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>1427</fpage>
          <lpage>1438</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.111</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
