<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i7e17638</article-id>
      <article-id pub-id-type="pmid">32459636</article-id>
      <article-id pub-id-type="doi">10.2196/17638</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Document-Level Biomedical Relation Extraction Using Graph Convolutional Network and Multihead Attention: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Hao</surname>
            <given-names>Tianyong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Tang</surname>
            <given-names>Buzhou</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Huang</surname>
            <given-names>Zhengxing</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gabashvili</surname>
            <given-names>Irene</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Linfeng</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Jian</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4656-7446</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Xiaoyu</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0426-8920</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Yu</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2012-226X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Yijia</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Computer Science and Technology</institution>
            <institution>Dalian University of Technology</institution>
            <addr-line>No. 2 Linggong Road, Ganjingzi District</addr-line>
            <addr-line>Dalian, 116023</addr-line>
            <country>China</country>
            <phone>86 0411 84708498</phone>
            <email>zhyj@dlut.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5843-4675</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Wen</surname>
            <given-names>Jiabin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2897-5590</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Hongfei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0872-7688</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Zhihao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6186-2024</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Xin</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8800-912X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Computer Science and Technology</institution>
        <institution>Dalian University of Technology</institution>
        <addr-line>Dalian</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of VIP</institution>
        <institution>The Second Hospital of Dalian Medical University</institution>
        <addr-line>Dalian</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yijia Zhang <email>zhyj@dlut.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>31</day>
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>7</issue>
      <elocation-id>e17638</elocation-id>
      <history>
        <date date-type="received">
          <day>30</day>
          <month>12</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>1</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>4</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Jian Wang, Xiaoyu Chen, Yu Zhang, Yijia Zhang, Jiabin Wen, Hongfei Lin, Zhihao Yang, Xin Wang. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 31.07.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2020/7/e17638" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Automatically extracting relations between chemicals and diseases plays an important role in biomedical text mining. Chemical-disease relation (CDR) extraction aims at extracting complex semantic relationships between entities in documents, which contain intrasentence and intersentence relations. Most previous methods did not consider dependency syntactic information across the sentences, which is very valuable for the relation extraction task, in particular, for extracting the intersentence relations accurately.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this paper, we propose a novel end-to-end neural network based on the graph convolutional network (GCN) and multihead attention, which makes use of the dependency syntactic information across the sentences to improve CDR extraction task.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>To improve the performance of intersentence relation extraction, we constructed a document-level dependency graph to capture the dependency syntactic information across sentences. GCN is applied to capture the feature representation of the document-level dependency graph. The multihead attention mechanism is employed to learn the relatively important context features from different semantic subspaces. To enhance the input representation, the deep context representation is used in our model instead of traditional word embedding.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We evaluate our method on CDR corpus. The experimental results show that our method achieves an F-measure of 63.5%, which is superior to other state-of-the-art methods. In the intrasentence level, our method achieves a precision, recall, and F-measure of 59.1%, 81.5%, and 68.5%, respectively. In the intersentence level, our method achieves a precision, recall, and F-measure of 47.8%, 52.2%, and 49.9%, respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The GCN model can effectively exploit the cross-sentence dependency information to improve the performance of intersentence CDR extraction. Both the deep context representation and multihead attention are helpful in the CDR extraction task.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>biomedical relation extraction</kwd>
        <kwd>dependency graph</kwd>
        <kwd>multihead attention</kwd>
        <kwd>graph convolutional network</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Valuable biomedical information and knowledge are still hidden in the exponentially increasing biomedical literature, such as the chemical-disease relation (CDR). Extracting the relation between chemicals and diseases is an important task in biomedical text mining, which plays an important role in various biomedical research studies, such as clinical treatment, drug development, and biomedical knowledge discovery [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. However, extracting CDR from the biomedical literature manually is time-consuming and difficult to keep up-to-date. Thus, the BioCreative V community [<xref ref-type="bibr" rid="ref4">4</xref>] proposed a task of extracting CDR in the biomedical literature automatically to promote the research on the CDR extraction.</p>
      <p>To date, many methods have been proposed for automatic relation extraction between chemicals and diseases, which can be divided into 3 categories: rule-based methods [<xref ref-type="bibr" rid="ref5">5</xref>], feature-based methods [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], and deep neural network-based methods [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Rule-based methods aim to formulate the heuristic rules for CDR extraction. Lowe et al [<xref ref-type="bibr" rid="ref5">5</xref>] developed a pattern-based system with some heuristic rules to extract chemical-induced disease (CID) relations within the same sentence. The heuristic rules are used to extract the most likely CID relations when no patterns match a document. Generally, rule-based methods are simple and effective. However, these methods are difficult for application in a new task or dataset. Feature-based methods aim at designing rich features, including semantic and syntactic information. Xu et al [<xref ref-type="bibr" rid="ref6">6</xref>] utilized text features, including context information and entity information, incorporated with domain knowledge to extract CID relations. Since the syntactic information carried in the dependency graph of the sentence is crucial to CDR extraction, some studies also developed syntactic features. Gu et al [<xref ref-type="bibr" rid="ref7">7</xref>] utilized various linguistic features to extract CID relations with the maximum entropy model. They leveraged lexical features for both intrasentence and intersentence level relation extraction and developed the dependency features only for intrasentence level relation extraction. Zhou et al [<xref ref-type="bibr" rid="ref8">8</xref>] utilized the shortest dependency path between chemical and disease entities to extract structured syntactic features. Feature-based methods achieve better performance than rule-based methods. 
However, traditional feature-based methods only use the dependency trees to extract local syntactic dependencies for the intrasentence level relation extraction, without considering the syntactic dependencies across sentences for the document-level relation extraction. Besides, designing rich features is a time-consuming and laborious task.</p>
      <p>In recent years, the deep neural network has been widely used in various natural language processing (NLP) tasks. Some studies have developed deep neural network-based methods for biomedical relation extraction. Long short-term memory (LSTM) models and convolutional neural network (CNN) models are the 2 major neural networks. Zhou et al [<xref ref-type="bibr" rid="ref10">10</xref>] applied LSTM and CNN models based on traditional word embedding to capture context features for CDR extraction and achieve a good performance. Gu et al [<xref ref-type="bibr" rid="ref11">11</xref>] proposed a CNN-based model to capture context and dependency features for intrasentence level relation extraction. Nguyen and Verspoor [<xref ref-type="bibr" rid="ref13">13</xref>] investigated character-based word embedding into the CNN-based relation extraction model. Traditional word embedding such as word2vec cannot vary according to linguistic contexts effectively. Peters et al [<xref ref-type="bibr" rid="ref14">14</xref>] proposed deep contextualized word representations called ELMo based on a deep bidirectional language model. ELMo can generate a more comprehensive representation for each word based on the sentence context. Therefore, integrating ELMo with a deep neural network may improve the performance of CDR extraction.</p>
      <p>In both CNN-based and LSTM-based models, it is hard to distinguish the relevant and irrelevant context features for the relation extraction. A recent study [<xref ref-type="bibr" rid="ref15">15</xref>] suggested that attention mechanism can capture the most important semantic information for the relation extraction. Vaswani et al [<xref ref-type="bibr" rid="ref16">16</xref>] introduced a multihead attention mechanism that applied the self-attention mechanism multiple times to capture the relatively important features from different representation subspaces. Thus, multihead attention mechanism can be used to improve the performance of the CDR extraction.</p>
      <p>Dependency trees are often used to extract local dependencies for intrasentence level CDR extraction. However, existing studies ignored the nonlocal dependency across sentences, which is crucial for intersentence level CDR extraction. Quirk et al [<xref ref-type="bibr" rid="ref17">17</xref>] introduced a document graph that can derive features within and across sentences. Thus, we also constructed a document-level dependency graph that can extract dependencies for intrasentence and intersentence level CDR extraction simultaneously. Recently, the graph convolution network (GCN) [<xref ref-type="bibr" rid="ref18">18</xref>] has been effectively used for encoding document graph information. Thus, GCN can operate directly on the document-level dependency graph to capture long-range syntactic information, which is useful for CDR extraction.</p>
      <p>In this study, we evaluated the effectiveness of the deep contextualized word representations, multihead attention mechanism, and GCN in the CDR extraction task. To improve the performance of the intersentence relation extraction, we constructed the document-level dependency graph to capture the dependency syntactic information across sentences. Based on the document-level dependency graph, we proposed a novel end-to-end model to extract CID relations from the biomedical literature. First, we used ELMo, POS embedding, and position embedding to construct the input representation and employed the multihead attention with bidirectional LSTM (BiLSTM) to capture the relatively important context features. Second, we employed the GCN to capture the long-range dependency features based on the document-level dependency graph. Third, we combined the context features and long-range dependency features as the final feature representation and applied a <italic>Softmax</italic> function to implement relation classification. Finally, we evaluated our model on the CDR corpus.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>CDR Extraction</title>
        <p>The CDR extraction task is a challenging task, which was proposed by the BioCreative V community. The CDR extraction task aims to extract CDR from the biomedical literature automatically and accurately. It is composed of 2 subtasks: (1) disease named entity recognition and normalization and (2) CID relation extraction.</p>
        <p>In this study, we focused on the CID relation extraction task. The CDR extraction task is a document-level biomedical relation extraction problem, which is different from traditional biomedical relation extraction task. Traditional biomedical relation extraction only considers relation within a single sentence such as protein-protein interaction [<xref ref-type="bibr" rid="ref19">19</xref>] and drug-drug interaction [<xref ref-type="bibr" rid="ref20">20</xref>]. However, the CID relation is not only expressed within a single sentence, but it is also expressed across several sentences. <xref rid="figure1" ref-type="fig">Figure 1</xref> shows an illustration of CDR extraction. It is extracted from the CDR corpus whose PMID is 6203632. Among these sentences, the texts in bold mention the chemical and disease entities. In <xref rid="figure1" ref-type="fig">Figure 1</xref>, we mark the corresponding entity type and the medical subject headings concept identifiers [<xref ref-type="bibr" rid="ref21">21</xref>] after the entity mention in the sentence. The chemical D007545 has 2 intrasentence level co-occurrences with disease D006332 in the <italic>sentence 1</italic> and the <italic>sentence 2</italic>, while it has an intersentence level co-occurrence with disease D006965. However, not all occurrences of the chemicals and diseases are considered as a CID relation. For example, the chemical D007545 does not have a CID relation with the disease D006984 in the <italic>sentence 4</italic> because the concept of the disease D006984 is too general to reflect a CID relation.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Illustrative examples of CID relation. CID: chemical-induced disease.</p>
          </caption>
          <graphic xlink:href="medinform_v8i7e17638_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Relation Instance Construction</title>
        <p>First, we should construct relation instances for both training and testing stages. All the instances generated from the disease and chemical mentions in the document are pooled into 2 groups at the intrasentence and intersentence levels, respectively. The former means that a chemical-disease mention pair is in the same sentence. The latter means that the mentions of a pair are in different sentences. If the relation between the chemical and disease entity of the mention pair is annotated as a CID relation in the document, then this mention pair is constructed as a positive instance; otherwise, this mention pair is constructed as a negative instance. We applied several effective heuristic rules for both intrasentence and intersentence level instances. The details are as follows.</p>
        <sec>
          <title>Relation Instance Construction for Intrasentence Level</title>
          <list list-type="order">
            <list-item>
              <p>All chemical-disease entity mention pairs that appear in the same sentence are constructed as intrasentence level instances.</p>
            </list-item>
            <list-item>
              <p>If multiple mentions refer to the same entity in a sentence, the mentions in the nearest distance should be constructed as an instance.</p>
            </list-item>
            <list-item>
              <p>For instance, chemical D007545 and disease D006332 in <italic>sentence 1</italic> form an intrasentence level positive instance, while chemical D007545 and disease D006984 in <italic>sentence 4</italic> form an intrasentence level negative instance.</p>
            </list-item>
          </list>
        </sec>
        <sec>
          <title>Relation Instance Construction for Intersentence Level</title>
          <list list-type="order">
            <list-item>
              <p>Only the chemical-disease entity pairs that are not involved in any intrasentence level are considered as intersentence level instances.</p>
            </list-item>
            <list-item>
              <p>If multiple mentions refer to the same entity, the chemical and disease mention in the nearest distance are chosen.</p>
            </list-item>
          </list>
          <p>According to our heuristic rules, chemical D007545 in <italic>sentence 4</italic> and disease D006965 in <italic>sentence 5</italic> are regarded as an intersentence level instance because there are no mentions of them in the same sentence. Chemical D007545 in <italic>sentence 1</italic> and disease D006965 in <italic>sentence 5</italic> will be omitted because their distance is not the shortest. Further, chemical D007545 in <italic>sentence 4</italic> and disease D006984 in <italic>sentence 5</italic> are not regarded as an intersentence level instance because chemical D007545 already has intrasentence level co-occurrence with disease D006984 in <italic>sentence 4</italic>.</p>
        </sec>
      </sec>
      <sec>
        <title>Document-Level Dependency Graph</title>
        <p>To generate features for entity pairs within and across sentences, we introduce a document-level dependency graph with nodes representing words and edges that show intrasentence and intersentence dependency relations. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows an example of document-level dependency graph for 2 sentences. In this study, we use the following 3 types of intrasentence and intersentence dependency edges.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>An example of a document-level dependency graph for 2 sentences expressing a CID relation. The chemical and disease entity mention is highlighted in bold. For simplicity, we have omitted self-node edges. CID: chemical-induced disease.</p>
          </caption>
          <graphic xlink:href="medinform_v8i7e17638_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <list list-type="order">
          <list-item>
            <p>Syntactic dependency edge: The syntactic structure is crucial to biomedical relation extraction. Hence, we use syntactic dependency edges derived from Stanford dependency syntactic parser as intrasentential edges. For instance, “conj” denotes the syntactic relation between the word “stopped” and “followed” in the same sentence.</p>
          </list-item>
          <list-item>
            <p>Adjacent sentence edge: Dependencies between sentences are useful for document-level relation extraction. Thus, we consider the sentence as a node in a type of discourse dependency tree. Moreover, we added an edge between the dependency roots of adjacent sentences as an intersentential edge, which is a simple but an effective approach. For instance, “next” denotes the syntactic relation between 2 sentences.</p>
          </list-item>
          <list-item>
            <p>Self-node edge: We added self-node edges to all the nodes of the graph in order to enable GCN to not only learn information based on neighbor nodes but also learn the node information itself.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Model Architecture</title>
        <p>The schematic overview of our model is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. In short, our model mainly consists of 4 parts: the input representation layer, the BiLSTM layer, the multihead attention layer, and the GCN layer. The inputs of our model are text sequences. The input layer will generate a deep contextualized word representation for each word. Recent studies [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>] have suggested that the part of speech (POS) and the position of each word are useful for biomedical relation extraction. Hence, we concatenate the deep contextualized word representation and POS and position embedding as the whole word representation. The BiLSTM layer will obtain contextual features from the word representation. The multihead attention layer will apply the self-attention mechanism multiple times to capture the relative semantic features from different representation subspaces. The GCN layer will operate over the document-level dependency graph to capture long-range syntactic features. We employed max pooling over the outputs of the multihead attention layer and the GCN layer and then concatenated these 2 vectors as the final representation. Finally, we employed a fully connected layer and the <italic>Softmax</italic> function to identify the CID relation. Our model will be described in detail in the following section.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Overview of our model. The input representation consists of ELMo, POS embedding, and position embedding. In the multihead self-attention layer, we only show the detailed self-attention computation for the word “administration.” In the GCN layer, we only show the detailed graph convolution computation for the word “administration.” BiLSTM: bidirectional long short-term memory; POS: part of speech; GCN: graph convolutional network.</p>
          </caption>
          <graphic xlink:href="medinform_v8i7e17638_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Input Representation</title>
          <p>We used ELMo instead of the traditional word representation in our model. Traditional word representation generates a fixed representation vector for the same word. However, ELMo is the function of the entire input sentence based on a bidirectional language model so that it can generate different representation vectors for the same word according to the different sentence context.</p>
          <p>A sequence {<italic>t<sub>1</sub>, t<sub>2</sub>, …, t<sub>N</sub></italic>} denotes the word tokens in a sentence <italic>S</italic>. Given a token <italic>t<sub>k</sub></italic>, the forward language model calculates the probability of the token <italic>t<sub>k</sub></italic> based on the previous tokens {<italic>t<sub>1</sub></italic>, <italic>t<sub>2</sub>, …, t<sub>(k-1)</sub></italic>} of <italic>t<sub>k</sub></italic> in the sentence <italic>S</italic> as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
(1)</p>
          <p>Similarly, the backward language model calculates the probability of the token <italic>t<sub>k</sub></italic> based on the subsequent tokens {<italic>t<sub>(k+1)</sub></italic>, <italic>t<sub>(k+2)</sub>, …, t<sub>N</sub></italic>} of <italic>t<sub>k</sub></italic> in the sentence <italic>S</italic> as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (2)</p>
          <p>Combining the forward and the backward language models as a bidirectional language model, the log-likelihood can be maximized as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>(3)</p>
          <p>ELMo can represent the semantic and syntactic information of the word. In our model, we use a linear combination of the hidden state in each layer of the bidirectional language model to generate a deep contextualized representation for words. The POS and the position information of a word are crucial to biomedical relation extraction. Therefore, we also utilize POS embedding and position embedding to enhance the representation ability of the input. The POS embedding represents the POS feature of a word, and the position embedding reflects the relative distance between the word and the target entity. Given a word at position <italic>i,</italic> we obtain its POS embedding <italic>w<sub>p,i</sub></italic> and position embedding <italic>w<sub>d,i</sub></italic> based on mapping matrixes <italic>M<sub>p</sub></italic> and <italic>M<sub>d</sub></italic>, respectively. Finally, the whole word representations concatenate deep contextualized word representations, POS embedding, and position embedding as follows:</p>
          <disp-formula><italic>w<sub>i</sub></italic> = [<italic>w<sub>e,i</sub></italic>; <italic>w<sub>p,i</sub></italic>; <italic>w<sub>d,i</sub></italic>] (4)
          </disp-formula>
        </sec>
        <sec>
          <title>BiLSTM</title>
          <p>The LSTM model is a variant of recurrent neural network models that has been used in many NLP tasks successfully. The LSTM model overcomes the vanishing gradient problem by introducing a gating mechanism [<xref ref-type="bibr" rid="ref24">24</xref>]. Therefore, it is suitable to capture the long-term dependency feature. The LSTM unit consists of 3 components: the input gate <italic>i<sub>t</sub></italic>, the forget gate <italic>f<sub>t</sub></italic>, and the output gate <italic>o<sub>t</sub></italic>. At the time step <italic>t</italic>, the LSTM unit utilizes the input word <italic>x<sub>t</sub></italic>, the previous hidden state <italic>h<sub>(t-1)</sub></italic>, and the previous cell state <italic>c<sub>(t-1)</sub></italic> to calculate the current hidden state <italic>h<sub>t</sub></italic> and cell state <italic>c<sub>t</sub></italic>. The equations are as follows:</p>
          <disp-formula><italic>f<sub>t</sub></italic>=<italic>σ</italic>(<italic>W<sub>f</sub>x<sub>t</sub></italic>+<italic>U<sub>f</sub>h<sub>(t-1)</sub></italic>+<italic>b<sub>f</sub></italic>) (5)
          </disp-formula>
          <disp-formula><italic>o<sub>t</sub></italic>=<italic>σ</italic>(<italic>W<sub>o</sub>x<sub>t</sub></italic>+<italic>U<sub>o</sub>h<sub>(t-1)</sub></italic>+<italic>b<sub>o</sub></italic>) (6)
          </disp-formula>
          <disp-formula><italic>g<sub>t</sub></italic>=<italic>tanh</italic>(<italic>W<sub>g</sub>x<sub>t</sub></italic>+<italic>U<sub>g</sub>h<sub>(t-1)</sub></italic>+<italic>b<sub>g</sub></italic>) (7)
          </disp-formula>
          <disp-formula><italic>i<sub>t</sub></italic>=<italic>σ</italic>(<italic>W<sub>i</sub>x<sub>t</sub></italic>+<italic>U<sub>i</sub>h<sub>(t-1)</sub></italic>+<italic>b<sub>i</sub></italic>) (8)
          </disp-formula>
          <disp-formula><italic>c<sub>t</sub></italic>=<italic>f<sub>t</sub></italic>⊙<italic>c<sub>(t-1)</sub></italic>+ <italic>i<sub>t</sub></italic>⊙<italic>g<sub>t</sub></italic> (9)</disp-formula>
          <disp-formula><italic>h<sub>t</sub></italic>= <italic>o<sub>t</sub></italic>⊙<italic>tanh</italic>(<italic>c<sub>t</sub></italic>) (10)</disp-formula>
          <p>where <italic>W, U, b</italic> are the weight and bias parameters, and ⊙ denotes element-wise multiplication. In this study, we use the BiLSTM model that can capture the forward and backward context features simultaneously. The BiLSTM model combines a forward LSTM and a backward LSTM. Given the hidden state of the forward LSTM <inline-graphic xlink:href="medinform_v8i7e17638_fig8.png" xlink:type="simple" mimetype="image"/> and the hidden state of the backward LSTM <inline-graphic xlink:href="medinform_v8i7e17638_fig9.png" xlink:type="simple" mimetype="image"/>, the final hidden state is concatenated as: <graphic xlink:href="medinform_v8i7e17638_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/></p>
        </sec>
        <sec>
          <title>Multihead Attention</title>
          <p>The BiLSTM model learns the context features from the input sequences automatically and effectively. However, these features make different contributions to the biomedical relation extraction. In our model, we capture the relatively important features by introducing multihead attention mechanism. The essence of multihead attention is applying self-attention mechanism multiple times so that it may let the model learn the relatively important features from different representation subspaces. The self-attention mechanism generates the output based on a query and a set of key-value pairs. The output is the weighted sum of the values, where the weight assigned to each value is computed by applying attention function to the query with the corresponding key. In our study, we deal with the output of the BiLSTM model by multihead self-attention. Further, we use the dot-product attention function instead of the standard additive attention function [<xref ref-type="bibr" rid="ref25">25</xref>] as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (11),</p>
          <p>where <italic>Q, K, V∈R<sup>n</sup></italic> represent query, key, and value matrixes, respectively. <italic>d</italic> is the dimension of the output of the BiLSTM model.</p>
          <p>The main idea of the multihead attention is applying the self-attention mechanism multiple times. If the multihead attention contains <italic>h</italic> heads, the <italic>i</italic>-th attention head can be calculated as <italic>head<sub>i</sub>=Attention</italic> (<italic>Q<sub>i</sub>,K<sub>i</sub>,V<sub>i</sub></italic>). Thus, the final multihead attention is the concatenation of {<italic>head<sub>1</sub>,head<sub>2</sub>,...,head<sub>h</sub></italic>} as <italic>MultiHead</italic> (<italic>Q,K,V</italic>)<italic>=Concat</italic> (<italic>head<sub>1</sub>,head<sub>2</sub>,...,head<sub>h</sub></italic>) <italic>W<sup>o</sup></italic>. The output of the multihead attention layer is a matrix of <italic>R<sup>n×d</sup></italic>.</p>
        </sec>
        <sec>
          <title>GCN</title>
          <p>GCN is an adaptation of CNN [<xref ref-type="bibr" rid="ref26">26</xref>], which operates on graphs. Given a graph with n nodes, the graph structure can be represented as an adjacency matrix <italic>A</italic>. In this study, we converted the document-level dependency graph into its corresponding adjacency matrix <italic>A</italic>, where <italic>A<sub>ij</sub>=</italic>1 if there is a dependency edge going from token <italic>i</italic> to token <italic>j</italic>; otherwise <italic>A<sub>ij</sub>=</italic>0. The dependency graph can be calculated as an undirected graph [<xref ref-type="bibr" rid="ref27">27</xref>], which means <italic>A<sub>ij</sub>=A<sub>ji</sub></italic>. Further, we add a self-node edge to all the nodes in the graph, which means <italic>A<sub>ii</sub></italic>=1. Since the degree of a node in the dependency graph varies a lot, this may bias the output representation toward favoring high-degree nodes, regardless of the information carried in the node. To solve this issue, we normalize the activations in the graph convolution before feeding it through the nonlinearity. Finally, the graph convolution operation for node <italic>i</italic> at the <italic>l</italic>-th layer where <inline-graphic xlink:href="medinform_v8i7e17638_fig12.png" xlink:type="simple" mimetype="image"/> and <inline-graphic xlink:href="medinform_v8i7e17638_fig13.png" xlink:type="simple" mimetype="image"/> denote the input representation and the output representation of node <italic>i</italic> can be defined as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (12),</p>
          <p>where <italic>W<sup>(l)</sup></italic> is the weight matrix, <italic>b<sup>(l)</sup></italic> is the bias vector, <graphic xlink:href="medinform_v8i7e17638_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> is the degree of node <italic>i</italic> in the dependency graph, and <italic>ρ</italic> is an activation function (eg, a rectified linear unit).</p>
          <p>The GCN model takes the output of the BiLSTM model as the input word representation: <graphic xlink:href="medinform_v8i7e17638_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> Then, we stack the graph convolution operation over layers and obtain <graphic xlink:href="medinform_v8i7e17638_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> as the output word representations of the GCN model. Note that the GCN model presented above uses the same parameters for all edges in the dependency graph.</p>
        </sec>
        <sec>
          <title>Relation Classification</title>
          <p>To make use of the output word representation of the GCN model for relation extraction, we generate the sentence representation as follows:</p>
          <disp-formula><italic>h<sub>sent</sub>=f</italic> (<italic>h</italic><sup>(</sup><italic><sup>L</sup></italic><sup>)</sup>)<italic>=f</italic> (<italic>GCN</italic>(<italic>h</italic><sup>(</sup><italic><sup>0</sup></italic><sup>)</sup>)) (13)</disp-formula>
          <p>where <italic>h</italic><sup>(</sup><italic><sup>L</sup></italic><sup>)</sup> denotes the output representations at the last layer <italic>L</italic> of the GCN model, and <italic>f:R<sup>n×d</sup>→R<sup>d</sup></italic> is a max-pooling function that maps <italic>n</italic> output vectors to the sentence vector.</p>
          <p>Inspired by recent studies [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], entity information is central to relation classification. Therefore, we also obtain the chemical entity representation <italic>h<sub>c</sub></italic> as shown in <inline-graphic xlink:href="medinform_v8i7e17638_fig18.png" xlink:type="simple" mimetype="image"/>. Similarly, we can obtain the disease entity representation <italic>h<sub>d</sub></italic>. The feature representation of the whole GCN model is <italic>h<sub>GCN</sub></italic>=[<italic>h<sub>sent</sub></italic>; <italic>h<sub>c</sub></italic>; <italic>h<sub>d</sub></italic>].</p>
          <p>We also obtain the feature representation <italic>h<sub>att</sub></italic> from the output of the multihead attention layer by applying max pooling to the multihead attention matrix. We concatenate <italic>h<sub>GCN</sub></italic> and <italic>h<sub>att</sub></italic> to form the final representation <italic>h<sub>final</sub></italic>=[<italic>h<sub>GCN</sub></italic>; <italic>h<sub>att</sub></italic>] for relation classification. Then, the final representation is fed into a 2-layer perceptron as follows:</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig19.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (14) and (15).</p>
          <p>Finally, the hidden representation <italic>h<sub>2</sub></italic> is fed to a <italic>Softmax</italic> function to calculate the confidence of the CID relation:</p>
          <disp-formula><italic>o=softmax</italic> (<italic>W<sub>o</sub>h<sub>2</sub>+b<sub>o</sub></italic>) (16)</disp-formula>
          <p>where <italic>o</italic> is the output, <italic>W<sub>o</sub></italic> is the weight matrix, and <italic>b<sub>o</sub></italic> is the bias vector.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Dataset</title>
        <p>We evaluated our model on the CDR corpus, which was released by the BioCreative V task. The CDR dataset is the benchmark dataset for the CID relation extraction task, which consists of 1500 PubMed abstracts—500 each for training, development, and test set. <xref ref-type="table" rid="table1">Table 1</xref> shows the details of the dataset.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Statistics of the chemical-disease relation dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="350"/>
            <col width="350"/>
            <thead>
              <tr valign="top">
                <td>Task dataset</td>
                <td>Abstracts (n=1500)</td>
                <td>Chemical-induced disease relations (n=3116)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Training</td>
                <td>500</td>
                <td>1038</td>
              </tr>
              <tr valign="top">
                <td>Development</td>
                <td>500</td>
                <td>1012</td>
              </tr>
              <tr valign="top">
                <td>Test</td>
                <td>500</td>
                <td>1066</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>In this study, the gold entity annotations provided by BioCreative V were used to evaluate our model. All the comparison methods reported in this paper were evaluated with gold entity annotations. Therefore, it is fair and comparable. Further, we measured the CID relation extraction performance with precision, recall, and F-measure.</p>
      </sec>
      <sec>
        <title>Experimental Settings</title>
        <p>The dimensions of POS embedding and position embedding are both 100. The dimension of ELMo is 1024. The dimensions of the LSTM hidden layer and the GCN layer are 500 with the dropout proportion of 0.5. The dimensions of 2-layer perceptron are also 500 with the dropout proportion of 0.5. Our model was trained by Adam [<xref ref-type="bibr" rid="ref30">30</xref>] with a learning rate of 0.001 and a minibatch size of 32. In addition, our model was implemented based on an open-source deep learning library PyTorch [<xref ref-type="bibr" rid="ref31">31</xref>]. We used StanfordNLP [<xref ref-type="bibr" rid="ref32">32</xref>] to obtain the POS of the word and the dependency tree. Further, we used the pretrained ELMo representations for the deep contextualized word representations.</p>
      </sec>
      <sec>
        <title>Experimental Results</title>
        <sec>
          <title>Effect of Input Representation</title>
          <p>We evaluated the effectiveness of the input representation of our model. We used the same model that we proposed and changed the input representations. The comparison performance of the different input representations is presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>The effect of the input representation on performance.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Input representation</td>
                  <td>Precision (%)</td>
                  <td>Recall (%)</td>
                  <td>F-measure (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Word<sup>a</sup></td>
                  <td>47.3</td>
                  <td>71.7</td>
                  <td>57.0</td>
                </tr>
                <tr valign="top">
                  <td>Word+position<sup>b</sup></td>
                  <td>49.1</td>
                  <td>71.4</td>
                  <td>58.2</td>
                </tr>
                <tr valign="top">
                  <td>Word+position+POS<sup>c</sup></td>
                  <td>51.6</td>
                  <td>71.8</td>
                  <td>
                    <italic>60.1</italic>
                  </td>
                </tr>
                <tr valign="top">
                  <td>ELMo<sup>d</sup></td>
                  <td>57.0</td>
                  <td>67.4</td>
                  <td>61.8</td>
                </tr>
                <tr valign="top">
                  <td>ELMo+position<sup>e</sup></td>
                  <td>54.2</td>
                  <td>74.9</td>
                  <td>62.9</td>
                </tr>
                <tr valign="top">
                  <td>ELMo+position+POS<sup>f</sup></td>
                  <td>56.3</td>
                  <td>72.7</td>
                  <td>
                    <italic>63.5</italic>
                  </td>
                </tr>
                <tr valign="top">
                  <td>BioBERT+position+POS<sup>g</sup></td>
                  <td>57.9</td>
                  <td>70.1</td>
                  <td>63.4</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>The input representation of the model is the word embedding, which is pretrained by word2vec.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>The input representation of the model is the concatenation of the word embedding and position embedding.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>The input representation of the model is the concatenation of the word embedding, position embedding, and part of speech (POS) embedding. The F-measure (%) for this representation was an important finding.</p>
              </fn>
              <fn id="table2fn4">
                <p><sup>d</sup>The input representation of the model is the deep contextualized word representation.</p>
              </fn>
              <fn id="table2fn5">
                <p><sup>e</sup>The input representation of the model is the deep contextualized word representation and position embedding.</p>
              </fn>
              <fn id="table2fn6">
                <p><sup>f</sup>The input representation of the model is the deep contextualized word representation, position embedding, and POS embedding. The F-measure (%) for this representation was an important finding.</p>
              </fn>
              <fn id="table2fn7">
                <p><sup>g</sup>The word representation is generated from the last hidden layer of the bidirectional encoder representations from transformers for biomedical text mining (BioBERT) [<xref ref-type="bibr" rid="ref33">33</xref>] in a feature-based approach, which means that the parameters of the BioBERT are not fine-tuned. The input representation of the model is the BioBERT word representation, position embedding, and POS embedding.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>In <xref ref-type="table" rid="table2">Table 2</xref>, we can observe that the model achieves an F-measure of 57.0% when we only use the pretrained word embedding as the input representation. When we concatenate the pretrained word embedding and position embedding, the F-measure is improved from 57.0% to 58.2%, which yields a 1.2% improvement. When we concatenate the pretrained word embedding, position embedding, and POS embedding as the input representations, we yield another 1.9% improvement compared with only using the pretrained word embedding and position embedding. The result indicates that both POS and position features are effective for the CID relation extraction. The deep contextualized word representation ELMo significantly outperforms the pretrained word embedding and yields a 4.8% improvement in the F-measure. The result indicates that ELMo can generate a more comprehensive representation for the word according to the sentence context, which results in a better CDR performance. Similarly, combining the position and POS embedding with the deep contextualized word representation can further improve the performance. When we concatenate the deep contextualized word representation, position embedding, and POS embedding as the input representation, we achieve the best F-measure of 63.5%. We also use the word representations generated from the bidirectional encoder representations from transformers for biomedical text mining in a feature-based approach and achieve an F-measure of 63.4%, which is similar to using ELMo.</p>
        </sec>
        <sec>
          <title>Effect of the Attention Mechanism</title>
          <p>We evaluated the effectiveness of the multihead self-attention mechanism. We used the same model architecture that we proposed, but we dealt with the output of BiLSTM by different attention mechanisms. The attention mechanism is divided into 2 categories: single-head attention mechanism and multihead attention mechanism. In single-head attention mechanism, we use 3 types of attention function: additive attention, general attention, and scaled dot-product attention, as shown below.</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig20.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (17)</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig21.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (18)</p>
          <p><graphic xlink:href="medinform_v8i7e17638_fig22.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (19)</p>
          <p>where <italic>h<sub>i</sub></italic> is the output of the BiLSTM, <italic>W<sub>1</sub>, W<sub>2</sub>, s, v</italic> are the parameter matrixes, and <italic>d</italic> is the dimension of the output of the BiLSTM model. The formula of the multihead attention is described in formula (11). The comparison performance of the different attention mechanism is presented in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>The effect of the attention mechanism on performance.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Attention mechanism</td>
                  <td>Precision (%)</td>
                  <td>Recall (%)</td>
                  <td>F-measure (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Without attention</td>
                  <td>55.1</td>
                  <td>71.3</td>
                  <td>62.2</td>
                </tr>
                <tr valign="top">
                  <td>Additive attention</td>
                  <td>55.9</td>
                  <td>70.3</td>
                  <td>62.3</td>
                </tr>
                <tr valign="top">
                  <td>General attention</td>
                  <td>55.3</td>
                  <td>71.8</td>
                  <td>62.5</td>
                </tr>
                <tr valign="top">
                  <td>Scaled dot-product attention</td>
                  <td>54.9</td>
                  <td>73.3</td>
                  <td> 62.8</td>
                </tr>
                <tr valign="top">
                  <td>Multihead attention</td>
                  <td>56.3</td>
                  <td>72.7</td>
                  <td>63.5</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>In <xref ref-type="table" rid="table3">Table 3</xref>, we can see that using the attention mechanism can improve the performance of the CID relation extraction. The multihead attention mechanism is more helpful than other single-head attention mechanisms. This suggests that the multihead attention mechanism can capture more valuable features from different representation subspaces.</p>
        </sec>
        <sec>
          <title>Effect of the Attention Heads</title>
          <p>We evaluated the effectiveness of the number of heads of the multihead attention mechanism. In this comparative experiment, we used the deep contextualized word representation, position embedding, and POS embedding as the input representation, and the dimensions of query, key, and value are the same. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, we only varied the number of heads of the multihead attention.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>The effect of the attention heads on performance.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Heads (n)</td>
                  <td>Precision (%)</td>
                  <td>Recall (%)</td>
                  <td>F-measure (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>2</td>
                  <td>57.2</td>
                  <td>68.2</td>
                  <td>62.2</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>56.9</td>
                  <td>70.6</td>
                  <td>63.0</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>56.3</td>
                  <td>72.7</td>
                  <td>63.5</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>57.0</td>
                  <td>70.2</td>
                  <td>62.9</td>
                </tr>
                <tr valign="top">
                  <td>10</td>
                  <td>54.4</td>
                  <td>75.4</td>
                  <td>63.2</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>In <xref ref-type="table" rid="table4">Table 4</xref>, we can see that the multihead attention mechanism can effectively improve the performance of the CID relation extraction. We can observe that the F-measure ranges from 62.2% to 63.5% when setting a different number of heads. When the number of heads is too small or too large, the performance will drop off. In short, we achieve the best F-measure of 63.5% when we set the number of heads as 5.</p>
        </sec>
      </sec>
      <sec>
        <title>Ablation Study</title>
        <p>To examine the contributions of the 2 main components, namely, multihead attention layer and GCN layer, we ran an ablation study. The experimental results are shown in <xref ref-type="table" rid="table5">Table 5</xref>. The results contain the intrasentence level, the intersentence level, and relation merging, in which the intrasentence and intersentence level results are merged to produce the final document-level result.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>An ablation study for our model.<sup>a</sup></p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="110"/>
            <col width="110"/>
            <col width="90"/>
            <col width="100"/>
            <col width="100"/>
            <col width="90"/>
            <col width="100"/>
            <col width="110"/>
            <col width="90"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Model</td>
                <td colspan="3">Intrasentence level</td>
                <td colspan="3">Intersentence level</td>
                <td colspan="3">Relation merging</td>
              </tr>
              <tr valign="top">
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td>F-measure (%)</td>
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td>F-measure (%)</td>
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td>F-measure (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Without multihead attention</td>
                <td>58.2</td>
                <td>82.9</td>
                <td>68.4</td>
                <td>44.7</td>
                <td>44.3</td>
                <td>44.5</td>
                <td>55.1</td>
                <td>71.3</td>
                <td>62.2</td>
              </tr>
              <tr valign="top">
                <td>Without GCN<sup>b</sup></td>
                <td>62.6</td>
                <td>74.1</td>
                <td>67.9</td>
                <td>43.6</td>
                <td>48.4</td>
                <td>45.9</td>
                <td>57.1</td>
                <td>66.4</td>
                <td>61.4</td>
              </tr>
              <tr valign="top">
                <td>Our model</td>
                <td>59.1</td>
                <td>81.5</td>
                <td>
                  <italic>68.5</italic>
                </td>
                <td>47.8</td>
                <td>52.2</td>
                <td>
                  <italic>49.9</italic>
                </td>
                <td>56.3</td>
                <td>72.7</td>
                <td>
                  <italic>63.5</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>The values in italics indicate significant findings.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>GCN: graph convolutional network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>We can observe that removing either the multihead attention layer or the GCN layer reduces the performance of the model. This suggests that both layers can learn effective features. When we remove the multihead attention layer and the GCN layer, the F-measure drops by 1.3% and 2.1%, respectively. In particular, we can observe that adding either the multihead attention layer or the GCN layer improves the performance in the intersentence level relation extraction by a large margin. When we remove the multihead attention layer and the GCN layer, the intersentence level F-measure drops by 5.4% and 4.0%, respectively. This suggests that the multihead attention layer can capture the relatively important features from different representation subspaces and the GCN layer can capture long-range syntactic features for intersentence level relation extraction.</p>
        <p/>
      </sec>
      <sec>
        <title>Comparison with Related Work</title>
        <p>We compared our model with several state-of-the-art methods of the CID relation extraction. These methods are divided into 2 categories: methods without additional resources (without knowledge bases) and methods using additional resources (with knowledge bases). These following methods have been summarized in <xref ref-type="table" rid="table6">Table 6</xref>.</p>
        <list list-type="order">
          <list-item>
            <p>Pattern rule-based: Lowe et al [<xref ref-type="bibr" rid="ref5">5</xref>] developed a pattern-based system with some heuristic rules to extract CID relations within the same sentence, and they achieved an F-measure of 60.8%.</p>
          </list-item>
          <list-item>
            <p>Maximum entropy model: Gu et al [<xref ref-type="bibr" rid="ref7">7</xref>] developed a machine learning-based system that utilized simple but effective manual linguistic features with the maximum entropy model. They built rich manual features for intrasentence level and intersentence level instances. They achieved an F-measure of 58.3%.</p>
          </list-item>
          <list-item>
            <p>LSTM+ support vector machine (SVM): Zhou et al [<xref ref-type="bibr" rid="ref10">10</xref>] developed a hybrid system, which consists of a feature-based model that utilized flat features and structure features with SVM and a neural network model based on LSTM. Their model achieved an F-measure of 56.0%. After using additional postprocessing heuristic rules, they achieved a 5.3% improvement in the F-measure.</p>
          </list-item>
          <list-item>
            <p>CNN+maximum entropy: Gu et al [<xref ref-type="bibr" rid="ref11">11</xref>] proposed a maximum entropy model for intersentence level relation extraction and a CNN model for intrasentence level relation extraction. They achieved an F-measure of 60.2%. They also used additional postprocessing heuristic rules to improve performance that increases the F-measure to 61.3%.</p>
          </list-item>
          <list-item>
            <p>Biaffine Relation Attention Network: Verga et al [<xref ref-type="bibr" rid="ref12">12</xref>] proposed this based on the multihead self-attention model, which can predict relationships between all the mentioned pairs in the document. The model achieved an F-measure of 62.1%.</p>
          </list-item>
          <list-item>
            <p>Graph convolutional neural network: Sahu et al [<xref ref-type="bibr" rid="ref18">18</xref>] proposed a labelled edge graph convolutional neural network model on a document-level graph. The model achieved an F-measure of 58.6%.</p>
          </list-item>
          <list-item>
            <p>SVM_Xu: Xu et al [<xref ref-type="bibr" rid="ref6">6</xref>] explored 4 different knowledge bases to extract the knowledge features and achieved an F-measure of 67.2%.</p>
          </list-item>
          <list-item>
            <p>SVM_Pons: Pons et al [<xref ref-type="bibr" rid="ref9">9</xref>] extracted 3 sets of features, which are prior knowledge and statistical and linguistic information from the document. They achieved an F-measure of 70.2%.</p>
          </list-item>
          <list-item>
            <p>Knowledge-guided convolutional network: Zhou et al [<xref ref-type="bibr" rid="ref34">34</xref>] proposed a CNN that integrated both relation representations and entity representations learned from knowledge bases. The model achieved an F-measure of 71.3%.</p>
          </list-item>
        </list>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Comparisons with related work.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="20"/>
            <col width="30"/>
            <col width="600"/>
            <col width="120"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="3"> Category and method</td>
                <td>Precision (%)</td>
                <td colspan="3">Recall (%)</td>
                <td>F-measure (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="8">
                  <bold>Without knowledge bases</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Lowe et al [<xref ref-type="bibr" rid="ref5">5</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> <break/>  <break/>  </td>
                <td>Pattern rule-based</td>
                <td>59.3</td>
                <td colspan="3">62.3</td>
                <td>60.8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Gu et al [<xref ref-type="bibr" rid="ref7">7</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>ME<sup>a</sup></td>
                <td>62.0</td>
                <td colspan="3">55.1</td>
                <td>58.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">    <bold>Zhou et al [<xref ref-type="bibr" rid="ref10">10</xref>]</bold></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td> LSTM+SVM<sup>b</sup></td>
                <td>64.9</td>
                <td colspan="3">49.3</td>
                <td>56.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td> LSTM+SVM+PP<sup>c</sup></td>
                <td>55.6</td>
                <td colspan="3">68.4</td>
                <td>61.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">   <bold>Gu et al [<xref ref-type="bibr" rid="ref11">11</xref>]</bold></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td> CNN+ME<sup>d</sup></td>
                <td>60.9</td>
                <td colspan="3">59.5</td>
                <td>60.2</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td> CNN+ME+PP</td>
                <td>55.7</td>
                <td colspan="3">68.1</td>
                <td>61.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Verga et al [<xref ref-type="bibr" rid="ref12">12</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>BRAN<sup>e</sup></td>
                <td colspan="2">55.6</td>
                <td>70.8</td>
                <td colspan="2">62.1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Sahu et al [<xref ref-type="bibr" rid="ref18">18</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>GCNN<sup>f</sup></td>
                <td colspan="2">52.8</td>
                <td>66.0</td>
                <td colspan="2">58.6</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Our study</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>GCN<sup>g</sup>+Multihead attention</td>
                <td colspan="2">56.3</td>
                <td>72.7</td>
                <td colspan="2">63.5</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>With knowledge bases</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Xu et al [<xref ref-type="bibr" rid="ref6">6</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>SVM</td>
                <td colspan="2">65.8</td>
                <td>68.6</td>
                <td colspan="2">67.2</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Pons et al [<xref ref-type="bibr" rid="ref9">9</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>SVM</td>
                <td colspan="2">73.1</td>
                <td>67.6</td>
                <td colspan="2">70.2</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="7">
                  <bold>Zhou et al [<xref ref-type="bibr" rid="ref34">34</xref>]</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td> </td>
                <td>KCN<sup>h</sup></td>
                <td colspan="2">69.7</td>
                <td>72.9</td>
                <td colspan="2">71.3</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>ME: maximum entropy model.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>LSTM+SVM: long short-term memory+support vector machine.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>LSTM+SVM+PP: long short-term memory+support vector machine+postprocessing.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>CNN+ME: convolutional neural network+maximum entropy model.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>BRAN: biaffine relation attention network.</p>
            </fn>
            <fn id="table6fn6">
              <p><sup>f</sup>GCNN: graph convolutional neural network.</p>
            </fn>
            <fn id="table6fn7">
              <p><sup>g</sup>GCN: graph convolutional network.</p>
            </fn>
            <fn id="table6fn8">
              <p><sup>h</sup>KCN: knowledge-guided convolutional network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In <xref ref-type="table" rid="table6">Table 6</xref>, the deep neural network-based methods achieved competitive performance in the CID relation extraction task. For example, Sahu et al [<xref ref-type="bibr" rid="ref18">18</xref>] used GCN to capture dependency information and achieved an F-measure of 58.6%. Compared with other deep neural network-based methods, we not only employed the multihead attention to capture the relatively important semantic features but also used the GCN to capture the valuable syntactic features from the document-level dependency graph automatically and effectively. We also observed that some studies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>] designed and extracted rich semantic and syntactic features for the relation extraction task and used additional postprocessing heuristic rules to improve performance. Our method is an end-to-end neural network-based model and achieves a high F-measure of 63.5% without using postprocessing heuristic rules. As shown in <xref ref-type="table" rid="table6">Table 6</xref>, the methods with knowledge bases outperform the methods without knowledge bases significantly. This suggests that prior knowledge is very useful for CID relation extraction. In this study, we focus on the effectiveness of GCN and multihead attention mechanism rather than the prior knowledge. We will attempt to integrate the biomedical knowledge to further improve the performance of our method in our future work.</p>
      </sec>
      <sec>
        <title>Visualization of Multihead Attention Mechanisms</title>
        <p>To understand our multihead self-attention mechanism clearly, we visualized the attention weights of an example sequence in <xref rid="figure4" ref-type="fig">Figure 4</xref>. Different colors represent different heads. The darker the color is, the higher the attention weight is. In <xref rid="figure4" ref-type="fig">Figure 4</xref>, each word pays different levels of attention to different words in different heads. For the word “Cardiac,” the word “Pilsicainide” has a higher weight score in the second head; however, the words “Torsades” and “Pointes” have a higher weight score in the last head. For the word “Pilsicainide,” the words “Cardiac” and “Death” have a higher weight score in the third head; however, the word “Torsades” has a higher weight score in the fourth head. Thus, the multihead self-attention mechanism can make the model capture the relatively important features from different representation subspaces.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Examples of the multihead self-attention mechanism. Attention weights are shown here only for the words "Cardiac" and "Pilsicainide." Different colors represent different heads.</p>
          </caption>
          <graphic xlink:href="medinform_v8i7e17638_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>To understand our model better, we performed an error analysis on the output of our final results. There are 2 main types of errors: false positive errors and false negative errors. We list some examples to analyze the errors. In false positive errors, some instances are nonrelations but are mistaken as CID relations. For the sentences “Carbamazepine (Chemical: D002220)-induced cardiac dysfunction (Disease: D006331)” and “A patient with sinus bradycardia and atrioventricular block (Disease: D054537) induced by carbamazepine (Chemical: D002220),” the disease D006331 is the hypernym of the disease D054537. According to the labeling rules of the CDR corpus, we need to extract the most specific relations. Thus, the first sentence does not express a CID relation and the second sentence expresses a CID relation. However, our model extracts a CID relation between the chemical D002220 and the disease D006331 in the first sentence incorrectly because the first sentence is the common sentence pattern that expresses a CID relation. In false negative errors, several CID relations are not recognized. One of the main reasons is that some intersentence level instances are removed by the heuristic rules in the relation instance construction stage when the sentence distance is more than 3. In the future, we will consider preferable preprocessing and postprocessing techniques to solve the above problems.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>In this paper, we propose a novel end-to-end neural network based on GCN and multihead attention. The document-level dependency graph is constructed to capture the dependency syntactic information across sentences. We applied GCN to capture the long-range dependency syntactic features, which can improve the performance of intersentence level relation extraction. Further, we employed the multihead attention mechanism to capture the relatively important context features from different semantic subspaces. ELMo is used in our model to enhance the input representation. We evaluate the effectiveness of ELMo, multihead attention mechanism, and GCN on the BioCreative V CDR dataset. Experimental results show that ELMo, multihead attention, and GCN can significantly improve the performance of the CDR extraction. Our method achieves an F-measure of 63.5%, which is superior to other state-of-the-art methods. There are many large-scale knowledge bases such as the Comparative Toxicogenomics Database, Unified Medical Language System, Medical Subject Headings, UniProt, and the commercial system Euretos Knowledge Platform. These knowledge bases contain a large amount of structured data in the form of triples (entity, relation, entity), wherein relation represents the relationship between 2 entities. Some studies suggest that integrating the structured information from the knowledge bases may improve the performance of the CDR extraction. In future studies, we will integrate the biomedical knowledge to further improve the performance of our method.</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDR</term>
          <def>
            <p>chemical-disease relation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CID</term>
          <def>
            <p>chemical-induced disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GCN</term>
          <def>
            <p>graph convolutional network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">POS</term>
          <def>
            <p>part of speech</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The work was supported by grants from National Natural Science Foundation of China (No. 61572098 and 61572102). We would like to thank the Natural Science Foundation of China. We also would like to thank all the anonymous reviewers for their valuable suggestions and constructive comments.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>JW and YZ led the method application, experiment conduction, and the result analysis. XC, YZ, and JW participated in the data extraction and preprocessing. YZ and XW participated in the manuscript revision. HL and ZY provided theoretical guidance and the revision of this paper.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Islamaj Dogan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>GC</given-names>
            </name>
            <name name-style="western">
              <surname>Névéol</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Understanding PubMed user search behavior through log analysis</article-title>
          <source>Database (Oxford)</source>
          <year>2009</year>
          <month>11</month>
          <day>27</day>
          <volume>2009</volume>
          <fpage>bap018</fpage>
          <lpage>bap018</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bap018"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bap018</pub-id>
          <pub-id pub-id-type="medline">20157491</pub-id>
          <pub-id pub-id-type="pmcid">PMC2797455</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hurle</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpal</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Sanseau</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Computational drug repositioning: from data to therapeutics</article-title>
          <source>Clin Pharmacol Ther</source>
          <year>2013</year>
          <month>04</month>
          <day>15</day>
          <volume>93</volume>
          <issue>4</issue>
          <fpage>335</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.1038/clpt.2013.1</pub-id>
          <pub-id pub-id-type="medline">23443757</pub-id>
          <pub-id pub-id-type="pii">clpt20131</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ouyang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Distant supervision for neural relation extraction integrated with word attention and property features</article-title>
          <source>Neural Netw</source>
          <year>2018</year>
          <month>04</month>
          <volume>100</volume>
          <fpage>59</fpage>
          <lpage>69</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neunet.2018.01.006</pub-id>
          <pub-id pub-id-type="medline">29471196</pub-id>
          <pub-id pub-id-type="pii">S0893-6080(18)30006-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Mattingly</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegers</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Assessing the state of the art in biomedical relation extraction: overview of the BioCreative V chemical-disease relation (CDR) task</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>baw032</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw032"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw032</pub-id>
          <pub-id pub-id-type="medline">26994911</pub-id>
          <pub-id pub-id-type="pmcid">PMC4799720</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>O'Boyle</surname>
              <given-names>Noel M</given-names>
            </name>
            <name name-style="western">
              <surname>Sayle</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Efficient chemical-disease identification and relationship extraction using Wikipedia to improve recall</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <month>04</month>
          <day>08</day>
          <volume>2016</volume>
          <fpage>baw039</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw039"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw039</pub-id>
          <pub-id pub-id-type="medline">27060160</pub-id>
          <pub-id pub-id-type="pii">baw039</pub-id>
          <pub-id pub-id-type="pmcid">PMC4825350</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>CD-REST: a system for extracting chemical-induced disease relation in literature</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>baw036</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw036"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw036</pub-id>
          <pub-id pub-id-type="medline">27016700</pub-id>
          <pub-id pub-id-type="pmcid">PMC4808251</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Chemical-induced disease relation extraction with various linguistic features</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <month>04</month>
          <day>06</day>
          <volume>2016</volume>
          <fpage>baw042</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw042"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw042</pub-id>
          <pub-id pub-id-type="medline">27052618</pub-id>
          <pub-id pub-id-type="pii">baw042</pub-id>
          <pub-id pub-id-type="pmcid">PMC4822558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Chemical-disease relations extraction based on the shortest dependency path tree</article-title>
          <year>2015</year>
          <conf-name>Proceedings of the Fifth BioCreative Challenge Evaluation Workshop</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Seville, Spain</conf-loc>
          <fpage>214</fpage>
          <lpage>219</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://pdfs.semanticscholar.org/e66a/754947a9abd6665ab16815f52bc1c9aed596.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pons</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>BF</given-names>
            </name>
            <name name-style="western">
              <surname>Akhondi</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Extraction of chemical-induced diseases using prior knowledge and textual information</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <month>04</month>
          <day>14</day>
          <volume>2016</volume>
          <fpage>baw046</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw046"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw046</pub-id>
          <pub-id pub-id-type="medline">27081155</pub-id>
          <pub-id pub-id-type="pii">baw046</pub-id>
          <pub-id pub-id-type="pmcid">PMC4831722</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Exploiting syntactic and semantics information for chemical-disease relation extraction</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <month>04</month>
          <day>14</day>
          <volume>2016</volume>
          <fpage>baw048</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw048"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw048</pub-id>
          <pub-id pub-id-type="medline">27081156</pub-id>
          <pub-id pub-id-type="pii">baw048</pub-id>
          <pub-id pub-id-type="pmcid">PMC4831723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Chemical-induced disease relation extraction via convolutional neural network</article-title>
          <source>Database (Oxford)</source>
          <year>2017</year>
          <month>01</month>
          <day>01</day>
          <volume>2017</volume>
          <issue>1</issue>
          <fpage>bax024</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bax024"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bax024</pub-id>
          <pub-id pub-id-type="medline">28415073</pub-id>
          <pub-id pub-id-type="pii">3098440</pub-id>
          <pub-id pub-id-type="pmcid">PMC5467558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Strubell</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
<surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Simultaneously self-attending to all mentions for full-abstract biological relation extraction</article-title>
          <year>2018</year>
          <conf-name>the 16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>New Orleans, USA</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/N18-1080</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Verspoor</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Convolutional neural networks for chemical-disease relation extraction are improved with character-based word embeddings</article-title>
          <year>2018</year>
          <conf-name>Proceedings of the BioNLP workshop</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <fpage>129</fpage>
          <lpage>136</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/W18-2314</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <year>2018</year>
          <conf-name>the 16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>New Orleans, USA</conf-loc>
          <fpage>2227</fpage>
          <lpage>2237</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/N18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nie</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A Multi-attention-Based Bidirectional Long Short-Term Memory Network for Relation Extraction</article-title>
          <year>2017</year>
          <conf-name>International Conference on Neural Information Processing</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Guangzhou, China</conf-loc>
          <fpage>216</fpage>
          <lpage>227</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-70139-4_22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
<conf-name>Neural Information Processing Systems (NIPS)</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Long Beach, USA</conf-loc>
          <fpage>5998</fpage>
          <lpage>6008</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/pdf/10.5555/3295222.3295349"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quirk</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Distant supervision for relation extraction beyond the sentence boundary</article-title>
          <year>2017</year>
          <conf-name>the 15th Conference of the European Chapter of the Association for Computational Linguistics</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Valencia, Spain</conf-loc>
          <fpage>1171</fpage>
          <lpage>1182</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/e17-1110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Christopoulou</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miwa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Inter-sentence Relation Extraction with Document-level Graph Convolutional Neural Network</article-title>
          <year>2019</year>
          <conf-name>the 57th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>4309</fpage>
          <lpage>4316</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krallinger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vazquez</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leitner</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Salgado</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chatr-Aryamontri</surname>
<given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Perfetto</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Briganti</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Licata</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Iannuccelli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Castagnoli</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cesareni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tyers</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schneider</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rinaldi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilbur</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rocha</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shatkay</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tendulkar</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Rak</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Noto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Elkan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dogan</surname>
              <given-names>RI</given-names>
            </name>
            <name name-style="western">
              <surname>Fontaine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Andrade-Navarro</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Valencia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The Protein-Protein Interaction tasks of BioCreative III: classification/ranking of articles and linking bio-ontology concepts to full text</article-title>
          <source>BMC Bioinformatics</source>
          <year>2011</year>
          <month>10</month>
          <day>03</day>
          <volume>12 Suppl 8</volume>
          <fpage>S3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-S8-S3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-12-S8-S3</pub-id>
          <pub-id pub-id-type="medline">22151929</pub-id>
          <pub-id pub-id-type="pii">1471-2105-12-S8-S3</pub-id>
          <pub-id pub-id-type="pmcid">PMC3269938</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Segura-Bedmar</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Martínez</surname>
<given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Herrero-Zazo</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Lessons learnt from the DDIExtraction-2013 Shared Task</article-title>
          <source>J Biomed Inform</source>
          <year>2014</year>
          <month>10</month>
          <volume>51</volume>
          <fpage>152</fpage>
          <lpage>64</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(14)00124-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2014.05.007</pub-id>
          <pub-id pub-id-type="medline">24858490</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(14)00124-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Coletti</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Bleich</surname>
              <given-names>HL</given-names>
            </name>
          </person-group>
          <article-title>Medical subject headings used to search the biomedical literature</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2001</year>
          <volume>8</volume>
          <issue>4</issue>
          <fpage>317</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/11418538"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2001.0080317</pub-id>
          <pub-id pub-id-type="medline">11418538</pub-id>
          <pub-id pub-id-type="pmcid">PMC130076</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Drug drug interaction extraction from biomedical literature using syntax convolutional neural network</article-title>
          <source>Bioinformatics</source>
          <year>2016</year>
          <month>11</month>
          <day>15</day>
          <volume>32</volume>
          <issue>22</issue>
          <fpage>3444</fpage>
          <lpage>3453</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27466626"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btw486</pub-id>
          <pub-id pub-id-type="medline">27466626</pub-id>
          <pub-id pub-id-type="pii">btw486</pub-id>
          <pub-id pub-id-type="pmcid">PMC5181565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Drug-drug interaction extraction via hierarchical RNNs on sequence and shortest dependency paths</article-title>
          <source>Bioinformatics</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>828</fpage>
          <lpage>835</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29077847"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btx659</pub-id>
          <pub-id pub-id-type="medline">29077847</pub-id>
          <pub-id pub-id-type="pii">4565590</pub-id>
          <pub-id pub-id-type="pmcid">PMC6030919</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory</article-title>
          <source>Neural Comput</source>
          <year>1997</year>
          <month>11</month>
          <day>15</day>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
          <pub-id pub-id-type="medline">9377276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural Machine Translation by Jointly Learning to Align and Translate</article-title>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations</conf-name>
          <conf-date>May 7-9, 2015</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1409.0473.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
<surname>LeCun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bottou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Haffner</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Gradient-based learning applied to document recognition</article-title>
          <source>Proc. IEEE</source>
          <year>1998</year>
          <month>11</month>
          <volume>86</volume>
          <issue>11</issue>
          <fpage>2278</fpage>
          <lpage>2324</lpage>
          <pub-id pub-id-type="doi">10.1109/5.726791</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Graph Convolution over Pruned Dependency Trees Improves Relation Extraction</article-title>
          <year>2018</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <fpage>2205</fpage>
          <lpage>2215</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1244</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Santoro</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raposo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Malinowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pascanu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Battaglia</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lillicrap</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A simple neural network module for relational reasoning</article-title>
          <year>2017</year>
          <conf-name>Advances in Neural Information Processing Systems</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Long Beach, USA</conf-loc>
          <fpage>4967</fpage>
          <lpage>4976</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/7082-a-simple-neural-network-module-for-relational-reasoning.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>End-to-end neural coreference resolution</article-title>
          <year>2017</year>
          <conf-name>the 2017 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <fpage>188</fpage>
          <lpage>197</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d17-1018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
<article-title>Adam: A Method for Stochastic Optimization</article-title>
          <year>2015</year>
          <conf-name>the 3rd International Conference for Learning Representations</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>San Diego</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1412.6980.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <article-title>An open source machine learning framework that accelerates the path from research prototyping to production deployment</article-title>
          <source>PyTorch: From Research to Production</source>
          <access-date>2020-04-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://pytorch.org/">https://pytorch.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>Software</article-title>
          <source>The Stanford Natural Language Processing Group</source>
          <access-date>2020-05-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://nlp.stanford.edu/software/">https://nlp.stanford.edu/software/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ning</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Knowledge-guided convolutional networks for chemical-disease relation extraction</article-title>
          <source>BMC Bioinformatics</source>
          <year>2019</year>
          <month>05</month>
          <day>21</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>260</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2873-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-019-2873-7</pub-id>
          <pub-id pub-id-type="medline">31113357</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-019-2873-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6528333</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
