<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i5e17644</article-id>
      <article-id pub-id-type="pmid">32469325</article-id>
      <article-id pub-id-type="doi">10.2196/17644</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Document-Level Biomedical Relation Extraction Leveraging Pretrained Self-Attention Structure and Entity Replacement: Algorithm and Pretreatment Method Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Hao</surname>
            <given-names>Tianyong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Tang</surname>
            <given-names>Buzhou</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Huang</surname>
            <given-names>Zhengxing</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhao</surname>
            <given-names>Zhenyu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Xiaofeng</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6111-7753</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Fan</surname>
            <given-names>Jianye</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4561-9910</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Dong</surname>
            <given-names>Shoubin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Communication and Computer Network Key Laboratory of Guangdong</institution>
            <institution>School of Computer Science and Engineering</institution>
            <institution>South China University of Technology</institution>
            <addr-line>No. 381, Wushan Road</addr-line>
            <addr-line>Tianhe District, Guangdong</addr-line>
            <addr-line>Guangzhou, 510610</addr-line>
            <country>China</country>
            <phone>86 15625125397</phone>
            <email>sbdong@scut.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0153-850X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Communication and Computer Network Key Laboratory of Guangdong</institution>
        <institution>School of Computer Science and Engineering</institution>
        <institution>South China University of Technology</institution>
        <addr-line>Guangzhou</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Shoubin Dong <email>sbdong@scut.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>29</day>
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>5</issue>
      <elocation-id>e17644</elocation-id>
      <history>
        <date date-type="received">
          <day>30</day>
          <month>12</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>3</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Xiaofeng Liu, Jianye Fan, Shoubin Dong. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 29.05.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/5/e17644/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The most current methods applied for intrasentence relation extraction in the biomedical literature are inadequate for document-level relation extraction, in which the relationship may cross sentence boundaries. Hence, some approaches have been proposed to extract relations by splitting the document-level datasets through heuristic rules and learning methods. However, these approaches may introduce additional noise and do not really solve the problem of intersentence relation extraction. It is challenging to avoid noise and extract cross-sentence relations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to avoid errors by dividing the document-level dataset, verify that a self-attention structure can extract biomedical relations in a document with long-distance dependencies and complex semantics, and discuss the relative benefits of different entity pretreatment methods for biomedical relation extraction.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This paper proposes a new data preprocessing method and attempts to apply a pretrained self-attention structure for document biomedical relation extraction with an entity replacement method to capture very long-distance dependencies and complex semantics.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Compared with state-of-the-art approaches, our method greatly improved the precision. The results show that our approach increases the F1 value, compared with state-of-the-art methods. Through experiments of biomedical entity pretreatments, we found that a model using an entity replacement method can improve performance.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>When considering all target entity pairs as a whole in the document-level dataset, a pretrained self-attention structure is suitable to capture very long-distance dependencies and learn the textual context and complicated semantics. A replacement method for biomedical entities is conducive to biomedical relation extraction, especially to document-level relation extraction.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>self-attention</kwd>
        <kwd>document-level</kwd>
        <kwd>relation extraction</kwd>
        <kwd>biomedical entity pretreatment</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>A large number of biomedical entity relations exist in the biomedical literature. It is beneficial for the development of biomedical fields to automatically and accurately extract these relations and form structured knowledge. Some biomedical datasets have been proposed for extracting biomedical relations, such as drug-drug interactions (DDI) [<xref ref-type="bibr" rid="ref1">1</xref>], chemical-protein relations (CPR) [<xref ref-type="bibr" rid="ref2">2</xref>], and chemical-induced diseases (CID) [<xref ref-type="bibr" rid="ref3">3</xref>]. The former 2 datasets are sentence-level annotated datasets that extract relations on a single sentence containing a single entity-pair mention, and the latter is a document-level annotated dataset, which means that it is uncertain whether relations are asserted from within sentences or across sentence boundaries.</p>
      <p>Most approaches [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] have focused on single sentences containing biomedical relations. For example, Zhang et al [<xref ref-type="bibr" rid="ref4">4</xref>] presented a hierarchical recurrent neural network (RNN) to combine raw sentences with their short dependency paths for a DDI task. To deal with long and complicated sentences, Sun et al [<xref ref-type="bibr" rid="ref5">5</xref>] separated sequences into short context subsequences and proposed a hierarchical recurrent convolutional neural network (CNN). Because these approaches cannot be directly applied to document-level datasets, some existing methods [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] divided the document-level dataset into 2 parts and trained an intrasentence model and an intersentence model. Nevertheless, because of long-distance dependencies and co-references, their methods cannot be adapted to cross-sentence relation extraction. Furthermore, splitting the dataset resulted in noise and rule-based mistakes.</p>
      <p>Currently, for intersentence relation extraction, some studies [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>] generate dependency syntax trees within sentences and across sentences and employ a graph neural network to capture dependencies. However, it is costly to build dependency syntax trees. In addition, few studies, except those by Li et al [<xref ref-type="bibr" rid="ref13">13</xref>] and Verga et al [<xref ref-type="bibr" rid="ref14">14</xref>], have considered the influence of noisy data due to the segmentation of datasets and taking advantage of the textual context. For a document-level annotated corpus, an entity-pair mention within sentences or across sentences has a biomedical relationship by thinking simply, which will undoubtedly cause errors and may ignore plenty of useful information such that many sentences with co-occurring or co-referential medical entity mentions refer to biomedical relations.</p>
      <p>For example, the chemical-disease relation (CDR) dataset is a document-level corpus designed to extract CID relations from biomedical literature [<xref ref-type="bibr" rid="ref15">15</xref>]. For CID relation extraction, most current methods [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>] divide the CDR dataset into intrasentence-level and intersentence-level relation instances using heuristic rules. Although these heuristic rules are effective, they inevitably generate noisy instances of CID relations or ignore some useful information. For example, the following sentence expresses CID relations between the chemical amitriptyline and the disease blurred vision: “The overall incidence of side effects and the frequency and severity of blurred vision, dry mouth, and drowsiness were significantly less with dothiepin than with amitriptyline.”</p>
      <p>According to heuristic rules [<xref ref-type="bibr" rid="ref8">8</xref>], the token distance between two mentions in an intrasentence level instance should be &#60;10. The token distance between the chemical amitriptyline and the disease blurred vision in this example is 12; therefore, this sentence is discarded. However, factually, this sentence is the only sentence in the document [<xref ref-type="bibr" rid="ref18">18</xref>] that describes the CID relation between the chemical amitriptyline and the disease blurred vision. Obviously, heuristic rules cannot precisely partition the CDR dataset, and they can induce the wrong classification by models, although they use multi-instance learning to reduce these errors.</p>
      <p>Therefore, when constructing relation instances from a document-level dataset, it is necessary to consider sentences with multiple mentions of target entities in the entire document. While treating all target entities in a document as a whole brings benefits, the challenges are very long-distance dependencies and complex semantics, from which traditional neural networks such as CNN or RNN cannot accurately extract document-level relations. Recently, pretrained self-attention structures, such as SciBERT [<xref ref-type="bibr" rid="ref19">19</xref>] and BERT [<xref ref-type="bibr" rid="ref20">20</xref>], were proposed and were not necessarily better than RNN at capturing long-range dependencies. However, they performed better at increasing the number of attention heads [<xref ref-type="bibr" rid="ref21">21</xref>]. A pretrained transformer has already learned more semantic features, and it performs well for sentence-level relation extraction; however, it did not apply to document-level relation extraction.</p>
      <p>To address these problems, this paper proposes a pretrained self-attention mechanism and entity replacement method to extract document-level relationships. In this way, this paper has several contributions. First, to avoid errors by dividing the document-level dataset, this paper proposes a new data preprocessing method that treats the target entity pair of some sentences in a document as an instance. Second, to better focus on the target entity pairs and their context, a replacement method is proposed to replace biomedical entity pairs with uniform words. Compared with the different entity preprocessing for biomedical entity pairs, the replacement method is more effective for biomedical relation extraction. Third, to solve the problem of long-distance dependencies and learning complex semantics, a pretrained self-attention structure is proposed for document-level relation extraction and to achieve superior performance than state-of-the-art approaches. Through analysis and visualization of the model structure, the effectiveness of the self-attention structure for document-level datasets is demonstrated.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Preprocessing for the Document-Level Corpus</title>
        <p>As already mentioned, splitting the document-level corpus will increase noise and may lose some useful information. To address this problem, the sentences in which the target entity pair is located and the sentences between them are constructed to an instance. This approach has the following benefits. First, it does not introduce error messages. The sentences do not need to be labeled after the segmentation of the dataset. The relationship between the marked relation pairs in the document corresponds to the instances one by one. Second, it discards useless information that is not related to the relationship of the target entities. Some are not related to those sentences in which the target entities are located; hence, they are noise for relation extraction. Discarding them will focus the model on the sentences in which the entity pair is located. Third, it keeps a lot of useful information, such as contextual information about entities and the relationship of entities.</p>
        <p>As shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, a document [<xref ref-type="bibr" rid="ref22">22</xref>] in the CDR dataset is constructed into biomedical relation instances. All chemical or disease entities are bold.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>An example of document-level relation instance construction.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In this document, there are 2 chemical entities, “amisulpride” (C1) and “calcium gluconate” (C2), and 4 disease entities: “overdose” (D1), “prolonged QT syndrome/QT prolongation” (D2), “poisoning” (D3), and “hypocalcemia” (D4). It should be noted that C1, C2, D1, D2, D3, and D4 are added to the document to indicate which are chemical entities and which are disease entities. Hence, the document can be constructed into 8 instances, in which 2 instances of C1 and D2 or C1 and D4 have CID relations. Sentences a), b), c), and d) formed the instance of C1 and D2. Sentences a), b), and c) formed the instance of C1 and D4.</p>
        <p>Semantically, both the intralevel sentence a) and the interlevel sentences b) and c) express the CID relationship of C1 and D2. However, according to heuristic rules, b) and c) will be discarded because only the entities that are not involved in any intrasentence level instance are considered at the intersentence level. Third, instances are full of contextual information of chemical and disease entities, which is conducive to document-level relation extraction when exploiting it well.</p>
        <p>There are lots of biomedical entities in a document. When constructing the instances of the target entity pair, it is inevitable that the same instance is tagged with different labels, resulting in incorrect classification. For example, as mentioned in the Methods section, the instances of C1-D2 and C2-D2 are the same but tagged with different labels. To solve this problem, entity pretreatment methods are presented.</p>
        <p>There are 2 different biomedical entity pretreatments, as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. In the first pretreatment, the target chemical and disease entities are respectively replaced with “chemical” and “disease,” which is called the replacement method. For example, in the instance of C1 and D2, sentence a) will be processed into “Two cases of chemical entity: a cause for disease.” In addition to the replacement method, there is another data preprocessing method, called the addition method. Different marks are added to the boundaries of chemical and disease entities, related to the relation instance. For instance, sentence a) will be processed into “Two cases of [[ amisulpride ]] overdose: a cause for &#60;&#60; prolonged QT syndrome &#62;&#62;”. In the Results section, we will describe the advantages and disadvantages of the 2 different pretreatment methods.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>An instance with two different biomedical entity pretreatments.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Architecture</title>
        <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, when adopting this data preprocessing, the length of most instances is very long, which results from very long-distance dependencies and complex semantics.</p>
        <p>A self-attention structure can directly calculate similarities between words, so that the distance between words is 1, which can intuitively solve long-distance dependencies. As demonstrated by Tang et al [<xref ref-type="bibr" rid="ref21">21</xref>], Transformer, a combined self-attention structure, is capable of semantic feature extraction far exceeding that of RNN and CNN and performs better when increasing the number of attention structures. Therefore, a pretrained self-attention structure, namely a pretrained transformer, is applied to these problems.</p>
        <p>However, for document-level relationship extraction, according to our preprocessing method, the length of instances is longer than the experimental data in the paper by Tang et al [<xref ref-type="bibr" rid="ref21">21</xref>], and the semantics are more complicated. There are multiple target entity pairs in the instances; some reflect the correct relationship, and some do not. Therefore, the transformer structure must have a certain reasoning ability. To verify the validity of a pretrained self-attention structure on document-level relation extraction, we adopted the structure of SciBERT, which was pretrained on the scientific literature, and added a feed-forward network (FFN) as a classifier. A visual model architecture is provided in <xref rid="figure3" ref-type="fig">Figure 3</xref>. We fine-tuned the model on the preprocessed CDR dataset. The structure of the model is described in detail in the following paragraphs.</p>
        <p>Based on the structure of BERT, SciBERT built a new vocabulary, called SCIVOCAB, and was trained on a scientific corpus that consists of computer science domain and biomedical papers. Following SciBERT, we still employ the same input representation, constructed by summation of token embedding, segment embedding, and position embedding. The tokens “[CLS]” and “[SEP]” are added at the beginning and end, respectively, of each instance. In addition, when tokenizing words, WordPiece embedding [<xref ref-type="bibr" rid="ref23">23</xref>] was used with SCIVOCAB to separate words and split word pieces with “##”.</p>
        <p>SciBERT is made up of <italic>N</italic> transformer stacks. Transformer stack <italic>k</italic> is denoted by <italic>Transformer<sub>k</sub></italic>, which has its own parameters and consists of 2 components: multihead attention and FFN.</p>
        <disp-formula><italic>S<sup>k</sup></italic> = <italic>Transformer<sub>k</sub></italic>(<italic>S<sup>k-1</sup></italic>) (<bold>1</bold>)</disp-formula>
        <p>where <italic>S<sup>k</sup></italic> ∈ <italic>R<sup>n</sup></italic><sup>×</sup><italic><sup>d</sup></italic> is the output of the transformer stack <italic>k</italic>. S<sup>0</sup> is the input representation of text sequence <italic>X, X</italic> ∈<italic>R<sup>n</sup></italic><sup>×</sup><italic><sup>d</sup></italic>. <italic>n</italic> is the length of text sequence, and <italic>d</italic> is the dimension of input representation. The whole text sequence shares the same parameters as the transformers.</p>
        <p>The multihead attention applies self-attention, or scaled dot-product attention, multiple times. Through the mapping of the query <italic>Q</italic>, key <italic>K</italic>, and value <italic>V</italic>, scaled dot-product attention obtains a weighted sum of the values. <italic>Q, K, V</italic> ∈ <italic>R<sup>n</sup></italic><sup>×</sup><italic><sup>d</sup></italic> are the same matrices in the self-attention computation that are the input of the transformer.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>The architecture of the model.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Instead of applying a single scaled dot-product attention, multihead attention applies query <italic>Q</italic>, key <italic>K</italic>, and value <italic>V</italic> to linearly project the input <italic>h</italic> times with different, learned linear projections to <italic>n</italic> × <italic>l</italic> dimensions, respectively, where <italic>l = d/h</italic> and <italic>h</italic> is the number of heads. The reason is that multihead attention can form different representation subspaces at different positions, learn more semantic information, and capture long-distance dependencies better.</p>
        <disp-formula><italic>O<sub>h</sub></italic> = softmax(<italic>QW<sub>i</sub><sup>Q</sup></italic>(<italic>KW<sub>i</sub><sup>K</sup></italic>)<sup>T</sup> / <italic>sqrt(d<sub>h</sub>)</italic>)<italic>VW<sub>i</sub><sup>V</sup></italic> (<bold>2</bold>)</disp-formula>
        <p>where the projections are parameter matrices <italic>W<sup>Q</sup></italic> ∈ <italic>R<sup>d</sup></italic><sup>×</sup><italic><sup>l</sup></italic>, <italic>W<sup>K</sup></italic> ∈ <italic>R<sup>d</sup></italic><sup>×</sup><italic><sup>l</sup></italic>, <italic>W<sup>V</sup></italic> ∈ <italic>R<sup>d</sup></italic><sup>×</sup><italic><sup>l</sup></italic>, and <italic>O<sub>h</sub></italic> ∈ <italic>R<sup>d</sup></italic><sup>×</sup><italic><sup>l</sup>.</italic> <italic>sqrt(d<sub>h</sub>)</italic> is a scale factor to prevent the result of the dot-product attention from enlarging, and <italic>sqrt(‧)</italic> indicates that the square root is extracted.</p>
        <p>Then, the outputs of the individual attention heads are merged, denoted as <italic>O</italic> ∈ <italic>R<sup>d</sup></italic><sup>×</sup><italic><sup>l</sup></italic>. The input and output of the multihead attention are connected by residual connection. Layer normalization, denoted <italic>LN</italic>, is applied to the output of the residual connection.</p>
        <disp-formula><italic>O =</italic> [<italic>o<sub>1</sub>;...;o<sub>h</sub></italic>] (<bold>3</bold>)</disp-formula>
        <disp-formula><italic>M</italic> = <italic>LN</italic>(<italic>S<sup>k-1</sup></italic>+<italic>O</italic>) (<bold>4</bold>)</disp-formula>
        <p>where <italic>M</italic> ∈ <italic>R<sup>n</sup></italic><sup>×</sup><italic><sup>d</sup></italic>.</p>
        <p>The second component of the transformer stack is 2 layers of pointwise FFN. Alternatively, it can be described as 2 convolutions with kernel size 1.</p>
        <disp-formula><italic>S<sup>k</sup> = ReLU</italic>(<italic>MW<sub>1</sub></italic>+<italic>b<sub>1</sub></italic>)<italic>W<sub>2</sub></italic>+<italic>b<sub>2</sub></italic> (<bold>5</bold>)</disp-formula>
        <p>where <italic>W<sub>1</sub></italic> ∈ <italic>R<sup>d×m</sup>, b<sub>1</sub></italic> ∈ <italic>R<sup>n×m</sup>, W<sub>2</sub></italic> ∈ <italic>R<sup>m×d</sup>,</italic> and <italic>b<sub>2</sub></italic> ∈ <italic>R<sup>n×d</sup></italic>. Each row of <italic>b<sub>1</sub></italic> or <italic>b<sub>2</sub></italic> is the same, and <italic>m</italic> = <italic>4d</italic>.</p>
        <p>The final layer is an FFN, a relation classifier. It corresponds to the final output of the transformer for the token “[CLS]”.</p>
        <disp-formula><italic>c = W<sup>pred</sup>s<sub>1</sub></italic> (<bold>6</bold>)</disp-formula>
        <p>where <italic>W<sup>pred</sup></italic> ∈ <italic>R<sup>o×d</sup></italic> is the weight matrix and <italic>s<sub>1</sub></italic> ∈ <italic>R<sup>d</sup></italic> is the final output of the token “[CLS]”.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we first describe some experimental datasets and provide some experiment settings. Then, we compare the performance of SciBERT with that of existing methods and validate the availability of the pretrained self-attention structure on the document-level dataset through the visualization of the multihead attention. Finally, experimenting on different datasets, including 2 sentence-level corpora and a document-level corpus, we compare various biomedical entity pretreatments and analyze which preprocessing is better for the self-attention structure.</p>
      </sec>
      <sec>
        <title>Datasets</title>
        <p><xref ref-type="table" rid="table1">Table 1</xref> shows the statistics of the CDR [<xref ref-type="bibr" rid="ref3">3</xref>], protein-protein interactions affected by mutations (PPIm) [<xref ref-type="bibr" rid="ref24">24</xref>], DDI [<xref ref-type="bibr" rid="ref1">1</xref>], and CPR [<xref ref-type="bibr" rid="ref2">2</xref>] datasets. The CDR and PPIm datasets are document-level annotated corpora, and the DDI and CPR datasets are sentence-level annotated corpora, which are only used to discuss the advantages and disadvantages of different biomedical entity pretreatments.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Descriptions of the chemical-disease relation datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="230"/>
            <col width="250"/>
            <col width="320"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Dataset, Types</td>
                <td>Training set</td>
                <td>Development set</td>
                <td>Test set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CDR<sup>a</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="3">
                  <break/>
                </td>
                <td>Documents</td>
                <td>500</td>
                <td>500</td>
                <td>500</td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>1038</td>
                <td>1012</td>
                <td>1066</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>4324</td>
                <td>4134</td>
                <td>4374</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>PPIm<sup>b</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="3">
                  <break/>
                </td>
                <td>Documents</td>
                <td>597</td>
                <td>N/A<sup>c</sup></td>
                <td>635</td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>750</td>
                <td>N/A<sup>c</sup></td>
                <td>869</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>1401</td>
                <td>N/A<sup>c</sup></td>
                <td>1717</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>DDI<sup>d</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="7">
                  <break/>
                </td>
                <td>Sentence</td>
                <td>18,872</td>
                <td>N/A<sup>c</sup></td>
                <td>3843</td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>3964</td>
                <td>N/A<sup>c</sup></td>
                <td>970</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>14,908</td>
                <td>N/A<sup>c</sup></td>
                <td>2873</td>
              </tr>
              <tr valign="top">
                <td>Int</td>
                <td>183</td>
                <td>N/A<sup>c</sup></td>
                <td>96</td>
              </tr>
              <tr valign="top">
                <td>Advise</td>
                <td>815</td>
                <td>N/A<sup>c</sup></td>
                <td>219</td>
              </tr>
              <tr valign="top">
                <td>Effect</td>
                <td>1654</td>
                <td>N/A<sup>c</sup></td>
                <td>357</td>
              </tr>
              <tr valign="top">
                <td>Mechanism</td>
                <td>1312</td>
                <td>N/A<sup>c</sup></td>
                <td>298</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>CPR<sup>e</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="8">
                  <break/>
                </td>
                <td>Sentences</td>
                <td>6437</td>
                <td>3558</td>
                <td>5744</td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>4172</td>
                <td>2427</td>
                <td>3469</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>2265</td>
                <td>1131</td>
                <td>2275</td>
              </tr>
              <tr valign="top">
                <td>CPR:3</td>
                <td>777</td>
                <td>552</td>
                <td>667</td>
              </tr>
              <tr valign="top">
                <td>CPR:4</td>
                <td>2260</td>
                <td>1103</td>
                <td>1667</td>
              </tr>
              <tr valign="top">
                <td>CPR:5</td>
                <td>173</td>
                <td>116</td>
                <td>198</td>
              </tr>
              <tr valign="top">
                <td>CPR:6</td>
                <td>235</td>
                <td>199</td>
                <td>293</td>
              </tr>
              <tr valign="top">
                <td>CPR:9</td>
                <td>727</td>
                <td>457</td>
                <td>644</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>CDR: chemical-disease relation.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>PPIm: protein-protein interaction affected by mutations.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Development sets do not exist in the PPIm and DDI datasets.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>DDI: drug-drug interaction.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>CPR: chemical-protein relation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The CDR dataset is used to extract CID and is a 2-label classification task. The PPIm dataset is released to extract protein-protein interactions affected by genetic mutations, which is a 2-label classification. Aimed at extracting drug-drug interactions, DDI is concerned with classifying into 5 relation types, including the int type, advice type, effect type, mechanism type, and negative type. For the DDI dataset, we adopted some rules to filter some negative sentences as described by Quan et al [<xref ref-type="bibr" rid="ref25">25</xref>]. With the purpose of extracting chemical-protein relations, the CPR dataset is labeled as 10 types of chemical-protein relations, 5 of which are used for evaluation. The chemical-protein relations of CPR are classified into 6 categories.</p>
        <p>Due to the size of the CDR dataset, we merged the training and development sets to construct the training set. After preprocessing the CDR and PPIm datasets, we counted the average number of sentences per instance, average number of tokens per instance, and average number of tokens per sentence in the constructed instance set. <xref ref-type="table" rid="table2">Table 2</xref> shows the statistics of the constructed instance set.</p>
      </sec>
      <sec>
        <title>Experiment Setup</title>
        <p>We employed the parameters of the uncased SciBERT with the vocabulary SCIVOCAB and fine-tuned on the CDR datasets. The model parameters are described as: SciBERT<sub>uncased</sub>: <italic>k</italic> = 12, <italic>h</italic> = 12, <italic>d</italic> = 768, <italic>m</italic> = 3072.</p>
        <p>Due to the distinction of the length of instances in the dataset, the input dimensions of the corresponding model for each dataset are different. For the CDR and PPIm datasets, the length of the input sequence is set to 512, and the batch size is set to 6. For the DDI dataset, the length of the input sequence is set to 150, and the batch size is set to 32. For the CPR dataset, the length of the input sequence is set to 200, and the batch size is set to 23. The epoch of all model training is set to 3. All results are averaged across 5 runs. For consistency of comparisons, we merged the training and development sets to train the models.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Statistics of the constructed instance sets of the chemical-disease relation (CDR) and protein-protein interaction affected by mutations (PPIm) datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="470"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Dataset, Types</td>
                <td>Training set</td>
                <td>Test set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CDR with preprocessing</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>Instances</td>
                <td>10,407</td>
                <td>5418</td>
              </tr>
              <tr valign="top">
                <td>Positive<break/>  <break/>  </td>
                <td>1947</td>
                <td>1042</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>8460</td>
                <td>4376</td>
              </tr>
              <tr valign="top">
                <td>Sentences per instance</td>
                <td>11.1</td>
                <td>12.1</td>
              </tr>
              <tr valign="top">
                <td>Tokens per instance</td>
                <td>161.5</td>
                <td>168.9</td>
              </tr>
              <tr valign="top">
                <td>Tokens per sentence</td>
                <td>14.6</td>
                <td>14.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>PPIm with preprocessing</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>Instances</td>
                <td>2151</td>
                <td>2586</td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>750</td>
                <td>869</td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>1401</td>
                <td>1717</td>
              </tr>
              <tr valign="top">
                <td>Sentences per instance</td>
                <td>9.0</td>
                <td>8.8</td>
              </tr>
              <tr valign="top">
                <td>Tokens per instance</td>
                <td>169.6</td>
                <td>186.6</td>
              </tr>
              <tr valign="top">
                <td>Tokens per sentence</td>
                <td>18.7</td>
                <td>21.2</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison of the Pretrained Self-Attention Structure With Other Methods</title>
        <p>For the CDR dataset, we compared our method with 6 state-of-the-art models without any knowledge bases. Zhou et al [<xref ref-type="bibr" rid="ref9">9</xref>] proposed a method based on feature engineering and long short-term memory. Gu et al [<xref ref-type="bibr" rid="ref8">8</xref>] combined CNN with maximum entropy. A recurrent piecewise CNN [<xref ref-type="bibr" rid="ref13">13</xref>] was the piecewise CNN. A bi-affine relation attention network [<xref ref-type="bibr" rid="ref14">14</xref>] incorporated an attention network, multi-instance learning, and multitask learning. A labeled edge graph CNN [<xref ref-type="bibr" rid="ref12">12</xref>] was used for document-level dependency graphs. For the PPIm dataset, we compared our method with 4 models. Because few studies focused on the PPIm dataset, the 4 models are not really state-of-the-art. <xref ref-type="table" rid="table3">Table 3</xref> shows the result of the comparisons.</p>
        <p>As shown in <xref ref-type="table" rid="table3">Table 3</xref>, compared with other approaches, our method with the replacement method greatly improved the precision. The F1 score is 1.9% higher than the best result from Vargas et al [<xref ref-type="bibr" rid="ref19">19</xref>] with the CDR test set. Our method also has great performance with the PPIm. It shows that a pretrained self-attention structure can be suitable for a document-level dataset.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance of the chemical-disease relation (CDR) and protein-protein interactions affected by mutations (PPIm) test datasets compared with state-of-the-art methods.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="460"/>
            <col width="170"/>
            <col width="170"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Dataset, Model</td>
                <td>P<sup>a</sup>, %</td>
                <td>R<sup>b</sup>, %</td>
                <td>F1, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CDR</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>LSTM<sup>c</sup> [<xref ref-type="bibr" rid="ref9">9</xref>]</td>
                <td>55.6</td>
                <td>68.4</td>
                <td>61.3</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>d</sup> [<xref ref-type="bibr" rid="ref8">8</xref>]</td>
                <td>55.7</td>
                <td>68.1</td>
                <td>61.3</td>
              </tr>
              <tr valign="top">
                <td>RPCNN<sup>e</sup> [<xref ref-type="bibr" rid="ref13">13</xref>]</td>
                <td>55.2</td>
                <td>63.6</td>
                <td>59.1</td>
              </tr>
              <tr valign="top">
                <td>BRAN<sup>f</sup> [<xref ref-type="bibr" rid="ref14">14</xref>]</td>
                <td>55.6</td>
                <td>70.8</td>
                <td>62.1</td>
              </tr>
              <tr valign="top">
                <td>GCNN<sup>g</sup> [<xref ref-type="bibr" rid="ref12">12</xref>]</td>
                <td>52.8</td>
                <td>66.0</td>
                <td>58.6</td>
              </tr>
              <tr valign="top">
                <td>Our method</td>
                <td>65.5</td>
                <td>62.6</td>
                <td>64.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>PPIm</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="5">
                  <break/>
                </td>
                <td>SVM<sup>h</sup> [<xref ref-type="bibr" rid="ref26">26</xref>]</td>
                <td>32.0</td>
                <td>34.0</td>
                <td>33.0</td>
              </tr>
              <tr valign="top">
                <td>CNN (without KB<sup>i</sup>) [<xref ref-type="bibr" rid="ref27">27</xref>]</td>
                <td>38.2</td>
                <td>37.3</td>
                <td>37.8</td>
              </tr>
              <tr valign="top">
                <td>MNM<sup>j</sup> [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
                <td>40.3</td>
                <td>32.3</td>
                <td>35.9</td>
              </tr>
              <tr valign="top">
                <td>MNM+Rule [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
                <td>38.0</td>
                <td>37.0</td>
                <td>37.5</td>
              </tr>
              <tr valign="top">
                <td>Our method</td>
                <td>83.5</td>
                <td>90.4</td>
                <td>86.8</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>RPCNN: recurrent piecewise convolutional neural network.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>BRAN: bi-affine relation attention network.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>GCNN: graph convolutional neural network.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>SVM: support vector machine.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>KB: knowledge base.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>MNM: memory neural network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Effects of Pretreatment Methods for Biomedical Entities</title>
        <p>As described later, there are 2 methods, one of which is the replacement method, that replace biomedical entities with uniform words. The second method is the addition method, which adds extra tags in the left and right sides of biomedical entities. We conducted experiments with the CDR, PPIm, DDI, and CPR datasets. The comparison of the 2 pretreatments for biomedical entities is shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p>
        <p>For each dataset, the recall rate and F1 score obtained with our model with the replacement method were higher than obtained with our model with the addition method, especially for the CDR dataset. The reason is that biomedical entities are complicated, and most are compound words. For the pretrained self-attention structure, the word embeddings of biomedical entities are hard to learn from small biomedical datasets. As a consequence, replacing the target entities with uniform words is beneficial for the model to understand target entities and pay more attention in the context of target entities.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparison of 2 pretreatments (addition and replacement) for biomedical entities using our method.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="180"/>
            <col width="120"/>
            <col width="120"/>
            <col width="130"/>
            <col width="130"/>
            <col width="140"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Dataset, Types</td>
                <td colspan="3">Addition method</td>
                <td colspan="3">Replacement method</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>P<sup>a</sup>, %</td>
                <td>R<sup>b</sup>, %</td>
                <td>F1, %</td>
                <td>P, %</td>
                <td>R, %</td>
                <td>F1, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CDR<sup>c</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Positive</td>
                <td>67.4</td>
                <td>54.8</td>
                <td>60.4</td>
                <td>65.5</td>
                <td>62.6</td>
                <td>64.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>PPIm<sup>d</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Positive</td>
                <td>79.3</td>
                <td>91.5</td>
                <td>84.8</td>
                <td>83.5</td>
                <td>90.4</td>
                <td>86.8</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>DDI<sup>e</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="5">
                  <break/>
                </td>
                <td>Int</td>
                <td>74.8</td>
                <td>46.2</td>
                <td>57.1</td>
                <td>76.2</td>
                <td>46.9</td>
                <td>58.0</td>
              </tr>
              <tr valign="top">
                <td>Advise</td>
                <td>87.2</td>
                <td>84.7</td>
                <td>85.9</td>
                <td>88.6</td>
                <td>89.0</td>
                <td>88.8</td>
              </tr>
              <tr valign="top">
                <td>Effect</td>
                <td>77.2</td>
                <td>82.1</td>
                <td>79.5</td>
                <td>77.0</td>
                <td>82.6</td>
                <td>79.7</td>
              </tr>
              <tr valign="top">
                <td>Mechanism</td>
                <td>84.8</td>
                <td>80.4</td>
                <td>82.5</td>
                <td>82.1</td>
                <td>86.0</td>
                <td>84.0</td>
              </tr>
              <tr valign="top">
                <td>All</td>
                <td>81.6</td>
                <td>78.6</td>
                <td>80.0</td>
                <td>81.2</td>
                <td>81.3</td>
                <td>81.4</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>CPR<sup>f</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>CPR:3</td>
                <td>73.5</td>
                <td>80.3</td>
                <td>76.7</td>
                <td>75.4</td>
                <td>79.5</td>
                <td>77.4</td>
              </tr>
              <tr valign="top">
                <td>CPR:4</td>
                <td>84.4</td>
                <td>88.8</td>
                <td>86.6</td>
                <td>83.7</td>
                <td>90.4</td>
                <td>86.9</td>
              </tr>
              <tr valign="top">
                <td>CPR:5</td>
                <td>80.7</td>
                <td>82.0</td>
                <td>81.3</td>
                <td>81.2</td>
                <td>86.5</td>
                <td>83.7</td>
              </tr>
              <tr valign="top">
                <td>CPR:6</td>
                <td>84.0</td>
                <td>89.4</td>
                <td>86.7</td>
                <td>86.5</td>
                <td>88.2</td>
                <td>87.3</td>
              </tr>
              <tr valign="top">
                <td>CPR:9</td>
                <td>76.2</td>
                <td>86.9</td>
                <td>81.2</td>
                <td>79.5</td>
                <td>90.1</td>
                <td>84.5</td>
              </tr>
              <tr valign="top">
                <td>All</td>
                <td>80.4</td>
                <td>86.5</td>
                <td>83.3</td>
                <td>81.4</td>
                <td>87.9</td>
                <td>84.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>CDR: chemical-disease relation.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>PPIm: protein-protein interaction affected by mutations.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>DDI: drug-drug interaction.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>CPR: chemical-protein relation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison of Different Pretrained Models</title>
        <p>BERT and SciBERT are the pretrained models that have the same self-attention structure. The difference between the two is that BERT is pretrained on the wiki corpus and SciBERT is pretrained on a large quantity of scientific papers from the computer science and biomedical domains. <xref ref-type="table" rid="table5">Table 5</xref> presents the comparison of BERT and SciBERT on 4 biomedical data sets. As shown by <xref ref-type="table" rid="table5">Table 5</xref>, SciBERT performs better than BERT, particularly with the F1 score, which was improved by 3.5% on the CDR data set. Therefore, the model pretrained on the biomedical corpus is beneficial for extracting biomedical relations.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Comparison of different pretrained models using our method.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="200"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="140"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Dataset, Type</td>
                <td colspan="6">BERT</td>
                <td colspan="5">SciBERT</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">P<sup>a</sup>, %</td>
                <td colspan="2">R<sup>b</sup>, %</td>
                <td colspan="2">F1, %</td>
                <td colspan="2">P, %</td>
                <td colspan="2">R, %</td>
                <td colspan="2">F1, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CDR<sup>c</sup></bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Positive</td>
                <td colspan="2">62.9</td>
                <td colspan="2">58.3</td>
                <td colspan="2">60.5</td>
                <td colspan="2">65.5</td>
                <td colspan="2">62.6</td>
                <td>64.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>PPIm<sup>d</sup></bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Positive</td>
                <td colspan="2">79.0</td>
                <td colspan="2">92.2</td>
                <td colspan="2">85.1</td>
                <td colspan="2">83.5</td>
                <td colspan="2">90.4</td>
                <td>86.8</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>DDI<sup>e</sup></bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="5">
                  <break/>
                </td>
                <td colspan="2">Int</td>
                <td colspan="2">69.8</td>
                <td colspan="2">42.7</td>
                <td colspan="2">52.8</td>
                <td colspan="2">76.2</td>
                <td colspan="2">46.9</td>
                <td>58.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Advise</td>
                <td colspan="2">91.3</td>
                <td colspan="2">89.0</td>
                <td colspan="2">90.1</td>
                <td colspan="2">88.6</td>
                <td colspan="2">89.0</td>
                <td>88.8</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Effect</td>
                <td colspan="2">74.1</td>
                <td colspan="2">77.6</td>
                <td colspan="2">75.7</td>
                <td colspan="2">77.0</td>
                <td colspan="2">82.6</td>
                <td>79.7</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Mechanism</td>
                <td colspan="2">78.5</td>
                <td colspan="2">80.1</td>
                <td colspan="2">79.3</td>
                <td colspan="2">82.1</td>
                <td colspan="2">86.0</td>
                <td>84.0</td>
              </tr>
              <tr valign="top">
                <td colspan="2">All</td>
                <td colspan="2">79.0</td>
                <td colspan="2">77.5</td>
                <td colspan="2">78.2</td>
                <td colspan="2">81.2</td>
                <td colspan="2">81.3</td>
                <td>81.4</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>CPR<sup>f</sup></bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td colspan="2">CPR:3</td>
                <td colspan="2">73.8</td>
                <td colspan="2">76.5</td>
                <td colspan="2">75.1</td>
                <td colspan="2">75.4</td>
                <td colspan="2">79.5</td>
                <td>77.4</td>
              </tr>
              <tr valign="top">
                <td colspan="2">CPR:4</td>
                <td colspan="2">81.7</td>
                <td colspan="2">89.7</td>
                <td colspan="2">85.5</td>
                <td colspan="2">83.7</td>
                <td colspan="2">90.4</td>
                <td>86.9</td>
              </tr>
              <tr valign="top">
                <td colspan="2">CPR:5</td>
                <td colspan="2">79.3</td>
                <td colspan="2">80.7</td>
                <td colspan="2">79.9</td>
                <td colspan="2">81.2</td>
                <td colspan="2">86.5</td>
                <td>83.7</td>
              </tr>
              <tr valign="top">
                <td colspan="2">CPR:6</td>
                <td colspan="2">80.2</td>
                <td colspan="2">84.6</td>
                <td colspan="2">82.2</td>
                <td colspan="2">86.5</td>
                <td colspan="2">88.2</td>
                <td>87.3</td>
              </tr>
              <tr valign="top">
                <td colspan="2">CPR:9</td>
                <td colspan="2">76.5</td>
                <td colspan="2">88.2</td>
                <td colspan="2">81.9</td>
                <td colspan="2">79.5</td>
                <td colspan="2">90.1</td>
                <td>84.5</td>
              </tr>
              <tr valign="top">
                <td colspan="2">All</td>
                <td colspan="2">79.0</td>
                <td colspan="2">85.9</td>
                <td colspan="2">82.3</td>
                <td colspan="2">81.4</td>
                <td colspan="2">87.9</td>
                <td>84.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>CDR: chemical-disease relation.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>PPIm: protein-protein interaction affected by mutations.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>DDI: drug-drug interaction.</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>CPR: chemical-protein relation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Analysis of Each Component of the Method</title>
        <p>Data preprocessing (DP) and pretraining means (PTM) are important components of our method; DP aims to alleviate noise, and PTM is designed to solve the long-distance dependencies. We compared the importance of each component of our method with the CDR dataset. <xref ref-type="table" rid="table6">Table 6</xref> shows the changes in performance on the CDR dataset by removing DP and PTM. PTM resulted in a greater performance improvement than DP.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Performance changes by removing different parts of our model.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="280"/>
            <col width="140"/>
            <col width="150"/>
            <col width="170"/>
            <col width="260"/>
            <thead>
              <tr valign="top">
                <td>CDR<sup>a</sup> dataset</td>
                <td>P<sup>b</sup>, %</td>
                <td>R<sup>c</sup>, %</td>
                <td>F1, %</td>
                <td>Change, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Baseline</td>
                <td>65.5</td>
                <td>62.6</td>
                <td>64.0</td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td>Remove DP<sup>d</sup></td>
                <td>67.0</td>
                <td>54.3</td>
                <td>60.0</td>
                <td>–6.3</td>
              </tr>
              <tr valign="top">
                <td>Remove PTM<sup>e</sup></td>
                <td>46.1</td>
                <td>39.5</td>
                <td>42.6</td>
                <td>–33.4</td>
              </tr>
              <tr valign="top">
                <td>Remove DP and PTM</td>
                <td>48.9</td>
                <td>31.2</td>
                <td>38.1</td>
                <td>–40.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>CDR: chemical-disease relation.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>P: precision.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>R: recall.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>DP: data preprocessing.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>PTM: pretraining means.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>To fully illustrate that our model can solve the problem of long-distance dependencies, we set 50 as the unit of instance length to count the number of positive and negative instances of the CDR test set, as shown in <xref ref-type="table" rid="table7">Table 7</xref>. As can be seen from the table, the instance length of the test sets is concentrated in the range of 50 to 300.</p>
        <p>We calculated the precision rate, recall rate, and accuracy rate of each interval length in the test set. The results are shown in <xref ref-type="table" rid="table8">Table 8</xref>. As can be seen in the table, the model has good performance when the instance length is longer than 100, except for the instances with lengths of 201 to 250. Therefore, our model can capture long-distance dependencies.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Quantity distribution of the chemical-disease relation test set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="390"/>
            <col width="200"/>
            <col width="210"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Interval length</td>
                <td>Positive</td>
                <td>Negative</td>
                <td>Sum</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0-50</td>
                <td>70</td>
                <td>344</td>
                <td>414</td>
              </tr>
              <tr valign="top">
                <td>51-100</td>
                <td>181</td>
                <td>884</td>
                <td>1065</td>
              </tr>
              <tr valign="top">
                <td>101-150</td>
                <td>200</td>
                <td>845</td>
                <td>1045</td>
              </tr>
              <tr valign="top">
                <td>151-200</td>
                <td>160</td>
                <td>756</td>
                <td>916</td>
              </tr>
              <tr valign="top">
                <td>201-250</td>
                <td>158</td>
                <td>663</td>
                <td>821</td>
              </tr>
              <tr valign="top">
                <td>251-300</td>
                <td>177</td>
                <td>571</td>
                <td>748</td>
              </tr>
              <tr valign="top">
                <td>301-350</td>
                <td>56</td>
                <td>216</td>
                <td>272</td>
              </tr>
              <tr valign="top">
                <td>351-400</td>
                <td>34</td>
                <td>64</td>
                <td>98</td>
              </tr>
              <tr valign="top">
                <td>&#62;400</td>
                <td>6</td>
                <td>33</td>
                <td>39</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Results of each interval length in the test set using our replacement method.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="390"/>
            <col width="200"/>
            <col width="210"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Interval length</td>
                <td>P<sup>a</sup>, %</td>
                <td>R<sup>b</sup>, %</td>
                <td>F1, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0-50</td>
                <td>54.2</td>
                <td>64.3</td>
                <td>58.8</td>
              </tr>
              <tr valign="top">
                <td>51-100</td>
                <td>57.5</td>
                <td>57.5</td>
                <td>57.5</td>
              </tr>
              <tr valign="top">
                <td>101-150</td>
                <td>67.7</td>
                <td>64.0</td>
                <td>65.8</td>
              </tr>
              <tr valign="top">
                <td>151-200</td>
                <td>64.4</td>
                <td>71.2</td>
                <td>67.7</td>
              </tr>
              <tr valign="top">
                <td>201-250</td>
                <td>66.2</td>
                <td>54.4</td>
                <td>59.7</td>
              </tr>
              <tr valign="top">
                <td>251-300</td>
                <td>69.9</td>
                <td>69.5</td>
                <td>69.7</td>
              </tr>
              <tr valign="top">
                <td>301-350</td>
                <td>70.8</td>
                <td>60.7</td>
                <td>65.4</td>
              </tr>
              <tr valign="top">
                <td>351-400</td>
                <td>80.0</td>
                <td>82.4</td>
                <td>81.2</td>
              </tr>
              <tr valign="top">
                <td>&#62;400</td>
                <td>100.0</td>
                <td>66.7</td>
                <td>80.0</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>P: precision.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>R: recall.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>To verify that the pretrained self-attention mechanism works as we believe, which is that it can take advantage of the textual context and capture very long-range dependencies to understand the complex semantics of biomedical text, we visualized the output of token “[CLS]” in the multihead of the final transformer stack, as shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>As seen by the token colors, the token “[CLS]” is related to the following tokens: “chemical,” “disease,” “drug,” “related,” “bilateral,” “[CLS],” and “[SEP]”. The 12 different colors refer to the different head attentions. The more and darker the colors, the more relevant the token. Lines between two tokens denote a correlation between two tokens. Their clarity depends on the result of the head attentions.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Visualization of the output of token “[CLS]” in the multihead attention of the final transformer stack.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>From the perspective of semantic analysis, there are 2 places in this example reflecting a relationship between a disease and chemical: “Drug-related disease are most often associated with chemical.” and “Bilateral disease after the use of entity, without concurrent chemical use, have never been reported.” In the first sentence, the relation between chemical and disease is mainly determined by the following tokens: “associated,” “chemical,” “disease,” “related,” and “drug.” In the second sentence, the relation between chemical and disease is mainly determined by the following tokens: “reported,” “never,” “chemical,” “disease,” “without,” and “concurrent.” Token “[CLS]” is related to the most keywords in both sentences. Therefore, the pretrained self-attention structure can take advantage of the textual context and capture very long-range dependencies from document-level instances. On the other hand, the distribution of the different colors shows that multihead attention can form diverse representation subspaces to learn more complicated semantics.</p>
        <p>However, from the gradation of the colors, the relationship between token “[CLS]” and the keywords is not strong enough. Token “[CLS]” is not highly correlated with token “disease” in this instance. We visualized the output of tokens “chemical” and “disease” in the final multihead attention, as shown in <xref rid="figure5" ref-type="fig">Figures 5</xref> and <xref rid="figure6" ref-type="fig">6</xref>. As seen in these figures, the tokens “chemical” and “disease” in the sentences capture more local information, compared with the token “[CLS].” It may be inferred that, for document-level relation extraction in the final layer of the pretrained self-attention structure, designing a special network to capture the relationships between different target entities is better than applying a dense layer.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Visualization of the output of token “disease” in the final multihead attention.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Visualization of the output of token “chemical” in the final multihead attention.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17644_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>For a document-level annotated dataset, instead of dividing the dataset, we considered all target entity pairs as a whole and applied a pretrained self-attention structure to extract biomedical relations. The results and analysis show that the pretrained self-attention structure extracted relations of multiple entity pairs in a document. Through the visualization of the transformer, we verified that the pretrained self-attention structure can capture long-distance dependencies and learn complicated semantics. Furthermore, we conclude that replacement of biomedical entities benefits biomedical relation extraction, especially for document-level relation extraction.</p>
        <p>However, this method still has some issues. In future work, we plan to design a more effective network to capture local relations between biomedical entities and improve our method.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BRAN</term>
          <def>
            <p>bi-affine relation attention network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDR</term>
          <def>
            <p>chemical-disease relation.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CID</term>
          <def>
            <p>chemical-induced disease.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CNN</term>
          <def>
            <p>convolutional neural network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CPR</term>
          <def>
            <p>chemical-protein relation.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">DDI</term>
          <def>
            <p>drug-drug interaction.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">DP</term>
          <def>
            <p>data preprocessing.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FNN</term>
          <def>
            <p>feed-forward network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">GCNN</term>
          <def>
            <p>graph convolutional neural network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">KB</term>
          <def>
            <p>knowledge base.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">MNM</term>
          <def>
            <p>memory neural network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">P</term>
          <def>
            <p>precision.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PPIm</term>
          <def>
            <p>protein-protein interactions affected by mutations.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PTM</term>
          <def>
            <p>pretraining means.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">R</term>
          <def>
            <p>recall.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">RNN</term>
          <def>
            <p>recurrent neural network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">RPCNN</term>
          <def>
            <p>recurrent piecewise convolutional neural network.</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SVM</term>
          <def>
            <p>support vector machine.</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was funded by the Natural Science Foundation of Guangdong Province of China (2015A030308017), National Natural Science Foundation of China (61976239), and Innovation Foundation of High-end Scientific Research Institutions of Zhongshan City of China (2019AG031).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Law</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Knox</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Djoumbou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jewison</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Maciejewski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Arndt</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neveu</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gabriel</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Adamjee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dame</surname>
              <given-names>ZT</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wishart</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <article-title>DrugBank 4.0: shedding new light on drug metabolism</article-title>
          <source>Nucleic Acids Res</source>
          <year>2014</year>
          <month>01</month>
          <volume>42</volume>
          <issue>Database issue</issue>
          <fpage>D1091</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://nar.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=24203711"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkt1068</pub-id>
          <pub-id pub-id-type="medline">24203711</pub-id>
          <pub-id pub-id-type="pii">gkt1068</pub-id>
          <pub-id pub-id-type="pmcid">PMC3965102</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krallinger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rabal</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Akhondi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Overview of the BioCreative VI chemical-protein interaction Track</article-title>
          <year>2017</year>
          <month>10</month>
          <conf-name>Proceedings of the sixth BioCreative challenge evaluation workshop</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Bethesda, Maryland</conf-loc>
          <fpage>141</fpage>
          <lpage>146</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sciaky</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Mattingly</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegers</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>BioCreative V CDR task corpus: a resource for chemical disease relation extraction</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw068"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw068</pub-id>
          <pub-id pub-id-type="medline">27161011</pub-id>
          <pub-id pub-id-type="pii">baw068</pub-id>
          <pub-id pub-id-type="pmcid">PMC4860626</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Drug-drug interaction extraction via hierarchical RNNs on sequence and shortest dependency paths</article-title>
          <source>Bioinformatics</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>828</fpage>
          <lpage>835</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29077847"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btx659</pub-id>
          <pub-id pub-id-type="medline">29077847</pub-id>
          <pub-id pub-id-type="pii">4565590</pub-id>
          <pub-id pub-id-type="pmcid">PMC6030919</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Hierarchical Recurrent Convolutional Neural Network for Chemical-protein Relation Extraction from Biomedical Literature</article-title>
          <year>2018</year>
          <month>03</month>
          <conf-name>IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>
          <conf-date>December 2018</conf-date>
          <conf-loc>Madrid, Spain</conf-loc>
          <fpage>3</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/bibm.2018.8621159</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Chemical-gene relation extraction using recursive neural network</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay060"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay060</pub-id>
          <pub-id pub-id-type="medline">29961818</pub-id>
          <pub-id pub-id-type="pii">5042822</pub-id>
          <pub-id pub-id-type="pmcid">PMC6014134</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rios</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Extracting chemical-protein relations with ensembles of SVM and deep learning models</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay073"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay073</pub-id>
          <pub-id pub-id-type="medline">30020437</pub-id>
          <pub-id pub-id-type="pii">5055578</pub-id>
          <pub-id pub-id-type="pmcid">PMC6051439</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Chemical-induced disease relation extraction via convolutional neural network</article-title>
          <source>Database (Oxford)</source>
          <year>2017</year>
          <month>01</month>
          <day>01</day>
          <volume>2017</volume>
          <issue>1</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bax024"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bax024</pub-id>
          <pub-id pub-id-type="medline">28415073</pub-id>
          <pub-id pub-id-type="pii">3098440</pub-id>
          <pub-id pub-id-type="pmcid">PMC5467558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Exploiting syntactic and semantics information for chemical-disease relation extraction</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw048"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw048</pub-id>
          <pub-id pub-id-type="medline">27081156</pub-id>
          <pub-id pub-id-type="pii">baw048</pub-id>
          <pub-id pub-id-type="pmcid">PMC4831723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>NY</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Quirk</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Cross-sentence n-ary relation extraction with Graph LSTMs</article-title>
          <source>Transactions of the Association for Computational Linguistics</source>
          <year>2017</year>
          <volume>5</volume>
          <fpage>101</fpage>
          <lpage>115</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gildea</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>N-ary relation extraction using graph-state LSTM</article-title>
          <year>2018</year>
          <month>10</month>
          <conf-name>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Christopoulou</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miwa</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Inter-sentence relation extraction with document-level graph convolutional neural network</article-title>
          <year>2019</year>
          <month>06</month>
          <conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Chemical-induced disease extraction via recurrent piecewise convolutional neural networks</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2018</year>
          <month>07</month>
          <day>23</day>
          <volume>18</volume>
          <issue>Suppl 2</issue>
          <fpage>60</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-018-0629-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-018-0629-3</pub-id>
          <pub-id pub-id-type="medline">30066652</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-018-0629-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6069297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Strubell</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Simultaneously Self-Attending to All Mentions for Full-Abstract Biological Relation Extraction</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source>
          <year>2018</year>
          <conf-name>16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 6, 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>872</fpage>
          <lpage>884</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1080</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Mattingly</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegers</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Assessing the state of the art in biomedical relation extraction: overview of the BioCreative V chemical-disease relation (CDR) task</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw032"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw032</pub-id>
          <pub-id pub-id-type="medline">26994911</pub-id>
          <pub-id pub-id-type="pii">baw032</pub-id>
          <pub-id pub-id-type="pmcid">PMC4799720</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Chemical-induced disease relation extraction with various linguistic features</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw042"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw042</pub-id>
          <pub-id pub-id-type="medline">27052618</pub-id>
          <pub-id pub-id-type="pii">baw042</pub-id>
          <pub-id pub-id-type="pmcid">PMC4822558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Panyam</surname>
              <given-names>NC</given-names>
            </name>
            <name name-style="western">
              <surname>Verspoor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ramamohanarao</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Exploiting graph kernels for high performance biomedical relation extraction</article-title>
          <source>J Biomed Semantics</source>
          <year>2018</year>
          <month>01</month>
          <day>30</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>7</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-017-0168-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13326-017-0168-3</pub-id>
          <pub-id pub-id-type="medline">29382397</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13326-017-0168-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5791373</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stratas</surname>
              <given-names>NE</given-names>
            </name>
          </person-group>
          <article-title>A double-blind study of the efficacy and safety of dothiepin hydrochloride in the treatment of major depressive disorder</article-title>
          <source>J Clin Psychiatry</source>
          <year>1984</year>
          <month>11</month>
          <volume>45</volume>
          <issue>11</issue>
          <fpage>466</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="medline">6386793</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beltagy</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cohan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>SciBERT: A Pre-trained Language Model for scientific text</article-title>
          <year>2019</year>
          <month>11</month>
          <conf-name>2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Hong Kong</conf-loc>
          <publisher-loc>SciBERT</publisher-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1371</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Computation and Language</source>
          <year>2019</year>
          <month>05</month>
          <fpage>2019</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rios</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sennrich</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Why self-attention? a targeted evaluation of neural machine translation architectures</article-title>
          <year>2018</year>
          <month>11</month>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>DI</given-names>
            </name>
          </person-group>
          <article-title>Two cases of amisulpride overdose: a cause for prolonged QT syndrome</article-title>
          <source>Emerg Med Australas</source>
          <year>2005</year>
          <month>06</month>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>274</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1742-6723.2005.00734.x</pub-id>
          <pub-id pub-id-type="medline">15953230</pub-id>
          <pub-id pub-id-type="pii">EMM734</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Google's neural machine translation system: Bridging the gap between human and machine translation</article-title>
          <source>arXiv preprint arXiv:1609.08144</source>
          <year>2016</year>
          <month>10</month>
          <fpage>2016</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dogan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chatr-aryamontri</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>BioCreative VI Precision Medicine Track: creating a training corpus for mining protein-protein interactions affected by mutations</article-title>
          <year>2017</year>
          <month>08</month>
          <conf-name>BioNLP 2017 workshop</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w17-2321</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Multichannel Convolutional Neural Network for Biological Relation Extraction</article-title>
          <source>Biomed Res Int</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>1850404</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2016/1850404"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2016/1850404</pub-id>
          <pub-id pub-id-type="medline">28053977</pub-id>
          <pub-id pub-id-type="pmcid">PMC5174749</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Soldaini</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cohan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Goharian</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Relation Extraction for Protein-protein Interactions Affected by Mutations</article-title>
          <year>2018</year>
          <month>08</month>
          <conf-name>BCB '18: ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Washington DC USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3233547.3233617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>An end-to-end deep learning architecture for extracting protein-protein interactions affected by genetic mutations</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <fpage>1</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay092"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay092</pub-id>
          <pub-id pub-id-type="medline">30239680</pub-id>
          <pub-id pub-id-type="pii">5096687</pub-id>
          <pub-id pub-id-type="pmcid">PMC6146129</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ning</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Leveraging prior knowledge for protein-protein interaction extraction with memory network</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay071"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay071</pub-id>
          <pub-id pub-id-type="medline">30010731</pub-id>
          <pub-id pub-id-type="pii">5053999</pub-id>
          <pub-id pub-id-type="pmcid">PMC6047414</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
