<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i10e41136</article-id>
      <article-id pub-id-type="pmid">36264604</article-id>
      <article-id pub-id-type="doi">10.2196/41136</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Relation Extraction in Biomedical Texts Based on Multi-Head Attention Model With Syntactic Dependency Feature: Modeling Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Hefner</surname>
            <given-names>Jennifer</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Cui</surname>
            <given-names>Yutao</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wang</surname>
            <given-names>Min</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Yongbin</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Medical Information Engineering</institution>
            <institution>Zunyi Medical University</institution>
            <addr-line>6 Xuefu Road West, Xinpu New District</addr-line>
            <addr-line>Zunyi, 563000</addr-line>
            <country>China</country>
            <phone>86 18311545098</phone>
            <email>bynn456@126.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3163-8448</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hui</surname>
            <given-names>Linhu</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9067-8197</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Zou</surname>
            <given-names>Liping</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5119-259X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Huyang</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7590-6331</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Xu</surname>
            <given-names>Luo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6942-896X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Xiaohua</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1367-6264</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Chua</surname>
            <given-names>Stephanie</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6550-6828</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Medical Information Engineering</institution>
        <institution>Zunyi Medical University</institution>
        <addr-line>Zunyi</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Faculty of Computer Science and Information Technology</institution>
        <institution>University Malaysia Sarawak</institution>
        <addr-line>Sarawak</addr-line>
        <country>Malaysia</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yongbin Li <email>bynn456@126.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>10</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>20</day>
        <month>10</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>10</issue>
      <elocation-id>e41136</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>7</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>8</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>27</day>
          <month>8</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>7</day>
          <month>9</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Yongbin Li, Linhu Hui, Liping Zou, Huyang Li, Luo Xu, Xiaohua Wang, Stephanie Chua. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 20.10.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/10/e41136" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>With the rapid expansion of biomedical literature, biomedical information extraction has attracted increasing attention from researchers. In particular, relation extraction between 2 entities is a long-term research topic.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to perform 2 multiclass relation extraction tasks of Biomedical Natural Language Processing Workshop 2019 Open Shared Tasks: relation extraction of Bacteria-Biotope (BB-rel) task and binary relation extraction of plant seed development (SeeDev-binary) task. In essence, these 2 tasks are aimed at extracting the relation between annotated entity pairs from biomedical texts, which is a challenging problem.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Traditional research methods adopted feature- or kernel-based methods and achieved good performance. For these tasks, we propose a deep learning model based on a combination of several distributed features, such as domain-specific word embedding, part-of-speech embedding, entity-type embedding, distance embedding, and position embedding. The multi-head attention mechanism is used to extract the global semantic features of an entire sentence. Meanwhile, we introduced a dependency-type feature and the shortest dependency path connecting 2 candidate entities in the syntactic dependency graph to enrich the feature representation.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Experiments show that our proposed model has excellent performance in biomedical relation extraction, achieving <italic>F</italic><sub>1</sub> scores of 65.56% and 38.04% on the test sets of the BB-rel and SeeDev-binary tasks. Especially in the SeeDev-binary task, the <italic>F</italic><sub>1</sub> score of our model is superior to that of other existing models and achieves state-of-the-art performance.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We demonstrated that the multi-head attention mechanism can learn relevant syntactic and semantic features in different representation subspaces and different positions to extract comprehensive feature representation. Moreover, syntactic dependency features can improve the performance of the model by learning dependency relation between the entities in biomedical texts.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>biomedical relation extraction</kwd>
        <kwd>deep learning</kwd>
        <kwd>feature combination</kwd>
        <kwd>multi-head attention</kwd>
        <kwd>additive attention</kwd>
        <kwd>syntactic dependency feature</kwd>
        <kwd>syntactic dependency graph</kwd>
        <kwd>shortest dependency path</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Information extraction (IE) [<xref ref-type="bibr" rid="ref1">1</xref>] involves extracting specific events or related information from texts; automatically classifying, extracting, and reconstructing useful information from massive amounts of content; and transforming it into structured knowledge. With the increasing demand for text mining technology to locate key information in biomedical literature, biomedical IE [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>] has become a new research hot spot. Simultaneously, with the explosive development of biomedical literature, many research directions for biomedical IE have been promoted, such as named entity recognition, protein relation extraction [<xref ref-type="bibr" rid="ref4">4</xref>], and drug interaction extraction [<xref ref-type="bibr" rid="ref5">5</xref>]. In particular, it is a challenging and practical problem to detect the relation between annotated entities in the biomedical text under relation constraints, which is an important research direction.</p>
        <p>The Biomedical Natural Language Processing Workshop-Open Shared Task (BioNLP-OST) series [<xref ref-type="bibr" rid="ref6">6</xref>] is representative of biomolecular IE, which aims to facilitate the development and sharing of biomedical text mining and fine-grained IE. BioNLP-OST has made a great contribution to the development of biomedical IE and has been held 5 times. The research topics of BioNLP-OST include fine-grained event extraction, biomedical knowledge base construction, and other areas. This study mainly focused on the relation extraction of Bacteria-Biotope (BB-rel) task and the binary relation extraction of plant seed development (SeeDev-binary) task in BioNLP-OST 2019 [<xref ref-type="bibr" rid="ref7">7</xref>]. These 2 multiclass subtasks essentially involve predicting whether and what relationship exists between 2 annotated entities. This study contributes to the development of practical applications for biomedical text mining.</p>
        <p>A series of innovative systems have achieved good results and actively promoted the development of biomedical IE. For example, in BB-rel and SeeDev-binary tasks, traditional relation extraction models are mainly based on feature-based [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and kernel-based methods [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. These methods rely on domain-specific knowledge or language tools to extract artificial features. For example, in the study by Björne and Salakoski [<xref ref-type="bibr" rid="ref12">12</xref>], a relation extraction system was constructed using a feature based on the shortest dependent path and support vector machine (SVM). In recent years, deep learning (DL) models have been successfully applied in many fields of natural language processing, requiring less feature engineering and automatic learning of useful information from corpus data (Kumar, S, unpublished data, May 2017). In the biomedical relation extraction field, several well-known DL models have been gradually applied and have achieved excellent performance, including distributed representation [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], convolutional neural network (CNN) [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>], and recurrent neural network [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Consequently, instead of complicating handcrafted feature engineering, we used the DL method to extract relations in biomedical texts.</p>
        <p>The combined application of the distributed features of a full sentence is the most common method for biomedical relation extraction [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Here, we use a variety of distributed features, such as domain-specific word embedding [<xref ref-type="bibr" rid="ref23">23</xref>], part of speech (POS) embedding [<xref ref-type="bibr" rid="ref24">24</xref>], entity-type embedding [<xref ref-type="bibr" rid="ref13">13</xref>], and distance embedding [<xref ref-type="bibr" rid="ref25">25</xref>]. However, it is difficult for commonly used models to focus on the key information of a full sentence; therefore, the attention mechanism [<xref ref-type="bibr" rid="ref26">26</xref>] has been proposed and proven to be successful in a wide range of natural language processing fields, such as machine translation, reading comprehension, and sentiment classification [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. In our proposed model, we use the multi-head attention mechanism proposed by Vaswani et al [<xref ref-type="bibr" rid="ref30">30</xref>] to deal with the combination of distributed features of the full sentence. Multi-head attention can ignore the distance between words, directly calculate the dependency between words, and learn the syntactic and semantic features of sentences in different representation subspaces. We also constructed position embedding (PE) to inject position information to take advantage of the order of words in a sentence.</p>
        <p>In our proposed model, we also integrated the shortest dependency path and dependency-type feature based on the syntactic dependency graph as one of the input features, which has been proven to be effective in several studies [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Although syntactic dependency features contain valuable syntactic information to facilitate the extraction of biomedical relations, they may still lose important information, such as prepositions before or after entities are likely to be discarded on the dependency path, which should play a key role [<xref ref-type="bibr" rid="ref33">33</xref>]. Hence, this study adopts the combination of distributed features and syntactic dependency features as the final feature representation of biomedical texts, in which syntactic dependency features exist as supplementary features.</p>
        <p>In this paper, we introduce a DL model to solve 2 biomedical relation extraction tasks: SeeDev-binary and BB-rel. We combined several distributed features and a multi-head attention mechanism to automatically extract global semantic features from long and complicated sentences. Syntactic-dependent features were also integrated into the model. As the shortest dependency path connecting 2 entities is short and concise, we apply a CNN to learn its features. We conducted extensive experiments, and our approach achieved <italic>F</italic><sub>1</sub> scores of 65.56% and 38.04% on BB-rel and SeeDev-binary tasks and achieved state-of-the-art performance on the SeeDev-binary task.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>The BB-rel task was conducted 3 times [<xref ref-type="bibr" rid="ref34">34</xref>] before, and the fourth edition [<xref ref-type="bibr" rid="ref35">35</xref>] in the BioNLP-OST 2019 focused on extracting information about bacterial biotopes and phenotypes, motivated by the importance of knowledge on biodiversity for theoretical research and applications in microbiology, involving entity recognition, entity normalization, and relation extraction. This edition has been extended to include a new entity type of <italic>phenotype</italic>, relation category of <italic>Exhibits</italic>, and new documents. We mainly studied one of the subtasks, the relation extraction task (BB-rel), which is to predict the relationship of <italic>Lives_In</italic> category between <italic>microorganism</italic>s, <italic>habitat</italic>s, and <italic>geographic</italic> entities, and the relation of <italic>Exhibits</italic> category between <italic>microorganism</italic> and <italic>phenotype</italic> entities from PubMed abstracts and full-text excerpts, where entity annotation has been provided. Many researchers have contributed their efforts to the BB-rel task and have proposed innovative methods. For example, in Biomedical Natural Language Processing Workshop 2016, TurkuNLP team used the method of the shortest dependent path using the Turku event extraction system (TEES) [<xref ref-type="bibr" rid="ref12">12</xref>] and 3 long short-term memory (LSTM) units, achieving an <italic>F</italic><sub>1</sub> score of 52.10% [<xref ref-type="bibr" rid="ref31">31</xref>]. The bidirectional gated recurrent unit-Attn team proposed a bidirectional gated recurrent unit with an attention model, with an <italic>F</italic><sub>1</sub> score of 57.42% [<xref ref-type="bibr" rid="ref36">36</xref>]. 
Amarin et al [<xref ref-type="bibr" rid="ref33">33</xref>] combined feature combinations with an attention model and contextual representations to achieve a state-of-the-art performance with an <italic>F</italic><sub>1</sub> score of 60.77%. In BioNLP-OST 2019, almost all researchers used neural network models in various architectures. For instance, the Yuhang_Wu team used a multilayer perceptron and achieved an <italic>F</italic><sub>1</sub> score of 60.49% on the test set. The highest <italic>F</italic><sub>1</sub> score was 66.39%, which was submitted by the whunlp team [<xref ref-type="bibr" rid="ref37">37</xref>]. They constructed a dependency graph based on lexical association, and used bidirectional LSTM (BiLSTM) [<xref ref-type="bibr" rid="ref38">38</xref>] and an attention graph convolution neural network to detect the relation. In addition, the AliAI team innovatively used a multitask architecture similar to <italic>Bidirectional Encoder Representations from Transformers</italic> (BERT) and achieved 64.96%, which effectively alleviated the lack of information in the domain-specific field [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
        <p>The SeeDev task [<xref ref-type="bibr" rid="ref40">40</xref>] aims to facilitate the extraction of complex events on regulations in plant development from scientific articles, with a focus on events describing the genetic and molecular mechanisms involved in <italic>Arabidopsis thaliana</italic> seed development. The SeeDev task involves extracting 21 relation categories, involving 16 entity types, to accurately reflect the complexity of the regulatory mechanisms of seed development, which is a major scientific challenge. SeeDev was originally proposed at BioNLP-OST 2016 [<xref ref-type="bibr" rid="ref6">6</xref>], and in 2019, the evaluation methodology focused more on the contribution of biology. It includes full and binary relation extraction, in which we mainly study the binary relation extraction subtask SeeDev-binary. To address this problem, most researchers have used traditional supervised machine learning approaches. These systems design artificial templates or manually extract many features based on domain-specific knowledge, such as linguistic features, semantic features, and syntactic information, which are added to the system as feature representations. Kernel-based machine learning algorithms such as SVM and Bayesian classifiers, which are widely used for IE, are then used to detect the relation categories. For instance, the UniMelb team [<xref ref-type="bibr" rid="ref41">41</xref>] developed an event extraction system using rich feature sets and SVM classifiers with a linear kernel. In addition, the MIC-CIS team [<xref ref-type="bibr" rid="ref42">42</xref>] used an SVM combined with linguistic features to achieve optimal results on BioNLP-OST 2019. As the DL model gradually became the main research method, the DUTIR team [<xref ref-type="bibr" rid="ref13">13</xref>] innovatively used a DL model based on distributed features and a CNN model [<xref ref-type="bibr" rid="ref15">15</xref>]. 
The YNU-junyi team [<xref ref-type="bibr" rid="ref14">14</xref>] integrated the LSTM model [<xref ref-type="bibr" rid="ref18">18</xref>] based on a CNN model to address the problem that CNN alone cannot capture the long-range dependence of sequences, and they obtained an <italic>F</italic><sub>1</sub> score of 34.18% on the SeeDev-binary task of BioNLP-OST 2019.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we describe our proposed model for the 2 biomedical relation extraction tasks in detail. The overall architecture is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. The preprocessing of the data sets is described in the first part. In the second part, we introduce a series of distributed semantic features used in our method, and the multi-head attention mechanism used on them is introduced in the third part. The fourth part explains the construction of the syntactic dependency feature. In the fifth part, we introduce the classification and training details. Finally, we present the training and hyperparameter settings.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The overall architecture of our proposed model with global semantic feature based on feature combination and multi-head attention as well as syntactic dependency feature. Dist_1: distance embedding corresponding to the first entity in a sentence; Dist_2: distance embedding corresponding to the second entity in a sentence; entity type: entity type embedding; POS: part-of-speech embedding; Word: word embedding.</p>
          </caption>
          <graphic xlink:href="medinform_v10i10e41136_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Preprocessing</title>
        <p>In the data preprocessing phase, we used TEES [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref31">31</xref>] to run a text preprocessing pipeline. The TEES system splits the text into sentences using the GENIA Sentence Splitter [<xref ref-type="bibr" rid="ref43">43</xref>] and parses the sentences through the integrated Brown Laboratory for Linguistic Information Processing parser [<xref ref-type="bibr" rid="ref44">44</xref>] with the biomedical domain model [<xref ref-type="bibr" rid="ref45">45</xref>] to obtain the tokens, POS tags, and parse graphs for each word. Then, the phrase structure trees obtained by the parser are further processed using the Stanford conversion tool [<xref ref-type="bibr" rid="ref46">46</xref>] to obtain the syntactic dependency graph.</p>
        <p>The BB-rel and SeeDev-binary tasks are relation extraction tasks, which detect whether and what relations exist between 2 annotated entities in biomedical texts. For example, in the sentence “The percentage of penicillin-resistant <italic>N. gonorrhoeae</italic> isolated in the region over the decade varied considerably,” in which <italic>N. gonorrhoeae</italic> is a microorganism-type entity and “percentage” is a phenotype-type entity, we need to detect whether there is a relationship between them and the category of the relation. There are usually 2 solutions to the relation extraction task: the first is to identify whether there is a relation between entity pairs in a sentence and then classify a correct category [<xref ref-type="bibr" rid="ref47">47</xref>], and the second method is to combine the 2 steps of identification and classification into 1 step [<xref ref-type="bibr" rid="ref13">13</xref>]. This paper adopts the second method, which regards nonrelation as a category of relationships and carries out multi-category classification.</p>
        <p>In the training and validation sets of the BB-rel and SeeDev-binary tasks, only positive instances were labeled. However, in the prediction phase, there may be a nonrelation between 2 candidate entities; therefore, it is necessary to manually construct negative instances in the training phase. After the biomedical texts are divided into sentences, we enumerate each entity pair in the sentence and judge the unlabeled instances as nonrelational. Because the biomedical relation extraction of SeeDev-binary and BB-rel tasks is under the constraint of regulation, there must be no relation between some entity types. For example, in the BB-rel task, there must be no biomedical relation between the entity of <italic>geographic</italic> type and the entity of <italic>phenotype</italic> type. Therefore, we need to further eliminate the entity pairs that do not comply with the regulations.</p>
        <p>In the data sets of the 2 tasks, not only do the entities of a relation appear in the same sentence (intrasentence) but also the entities of a relation may be in different sentences (intersentence), which is a great challenge regarding biomedical relation extraction tasks [<xref ref-type="bibr" rid="ref35">35</xref>]. In our method, we only considered intrasentence relations and ignored intersentence relations. There are 2 difficulties involved in the intersentence relation: one is that the reasoning relationship is difficult and complex; the other is that the number of negative instances increases exponentially, which leads to an extreme imbalance of positive and negative samples, resulting in performance degradation of the model. Therefore, all existing systems only extract intrasentence relations without considering intersentence relations [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. In addition, an instance is eliminated if there is no syntactic dependency path between the 2 candidate entities.</p>
      </sec>
      <sec>
        <title>Distributed Semantic Representation</title>
        <p>Our method extracts global semantic features from a full sentence through a combination of several distributed features and a multi-head attention mechanism. Domain-specific word embedding, POS embedding, entity-type embedding, distance embedding, and PE were integrated into our model.</p>
        <p>Word embedding is a frequently used distributed representation model that encodes rich semantic information into vectors. The sequence of a full sentence of length <italic>n</italic> can be represented as <italic>{w<sub>1</sub>,e<sub>1</sub>,...,e<sub>2</sub>,w<sub>n</sub>}</italic>, where <italic>e<sub>1</sub></italic> and <italic>e<sub>2</sub></italic> represent entity pairs. We initialized our word embeddings with a pretrained 200-dimensional biomedical word embedding model [<xref ref-type="bibr" rid="ref23">23</xref>], which was trained on PubMed and PMC abstracts and full texts, an unannotated corpus containing 5 billion tokens. The pretrained embedding model was trained using the word2vec tool with the skip-gram model [<xref ref-type="bibr" rid="ref48">48</xref>]. We only used the most frequent 100k words to build dictionary <italic>D</italic>, and the unknown words in the data sets were randomly initialized. Taking the BB-rel task as an example, it is possible that the words of an entity are not in dictionary <italic>D</italic>, so we add the words “Microorganism,” “Habitat,” “Geographical,” and “phenotype” to the dictionary and initialize them randomly. If an entity is of <italic>microorganism</italic> type and is not in the word embedding model, it will be replaced by the word “Microorganism.” Through the pretrained word embedding matrix, we can transform the sequence of tokens in a full sentence into a vector sequence <inline-graphic xlink:href="medinform_v10i10e41136_fig4.png" xlink:type="simple" mimetype="image"/>. We also used POS embedding [<xref ref-type="bibr" rid="ref24">24</xref>] to encode the POS for words in a sentence, which usually plays an important role. The POS embedding was randomly initialized and fine-tuned during the training phase.</p>
        <p>The combination of different types of entities has different probabilities for some relations; therefore, the entity type is an important factor for prediction [<xref ref-type="bibr" rid="ref13">13</xref>]. As the 2 biomedical relation extraction tasks are conditionally constrained, they do not involve the direction between entity pairs, so the entity-type sequence only needs one chain to represent it. Therefore, the entity-type sequence can be expressed as <italic>{−1,t<sub>1</sub>,...,t<sub>2</sub>,−1}</italic>, where nonentity words are labeled as −1. Through a randomly initialized type embedding matrix, the entity-type vector sequence can be represented as <inline-graphic xlink:href="medinform_v10i10e41136_fig5.png" xlink:type="simple" mimetype="image"/>.</p>
        <p>The distance sequence is divided into 2 chains, namely, the distance from the current word to the 2 candidate entities. In our method, relative distance [<xref ref-type="bibr" rid="ref25">25</xref>] is used to measure the distance between the current word and an entity, which can be formulated as equation 1, where <italic>l</italic> is the absolute distance and <italic>s</italic> is the maximum distance in the data sets. As the relative distance is not an integer, it is necessary to construct a distance dictionary and use the distance embedding matrix to generate the distance-vector sequence.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i10e41136_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>As we use the multi-head attention model to deal with the combination of a series of distributed features without using any time series model, we have to inject some absolute position information of words into the model; therefore, we introduce PE with reference as shown in the study by Vaswani et al [<xref ref-type="bibr" rid="ref30">30</xref>]. In our method, the PE vectors have the same dimension <italic>d<sub>word</sub></italic> as the word embedding, and then PE vectors can be calculated according to the sine and cosine functions of the frequencies. The formulas are given in equations 2 and 3, where <italic>pos</italic> is the position and <italic>i</italic> represents the <italic>i</italic>-th dimension of one word. Finally, the position information was injected into the model by adding the PE vector into the word embedding.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i10e41136_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Finally, a series of distributed features is concatenated, and each word <italic>w<sub>i</sub></italic> in the sentence can be represented as <inline-graphic xlink:href="medinform_v10i10e41136_fig8.png" xlink:type="simple" mimetype="image"/>. This comprehensive distributed feature is sent to the multi-head attention layer to extract the global semantic features of the full sentence.</p>
      </sec>
      <sec>
        <title>Multi-Head Attention Layer</title>
        <p>In recent years, a series of attention-based models have been applied to relationship extraction tasks with remarkable success [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. The core idea of the attention mechanism is to locate key information from text by assigning attention scores. At present, the most widely used attention models are additive attention [<xref ref-type="bibr" rid="ref26">26</xref>] and dot-product attention [<xref ref-type="bibr" rid="ref30">30</xref>]. In the study by Vaswani et al [<xref ref-type="bibr" rid="ref30">30</xref>], the multi-head attention mechanism was proposed as the main component unit of the transformer model. In this model, attention can be used to compute the output of a series of values through value mapping to a set of key-value pairs, that is, to calculate a weighted sum of the values, where the weight assigned to each value is computed by a query with the corresponding key. In our method, the multi-head attention mechanism is used as an encoder to extract the global semantic feature of the full sentence, and each attention head is calculated by integrating the position information and using the scaled dot-product attention function.</p>
        <p>The overall structure of scaled dot-product attention and multi-head attention is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, similar to that shown in the study by Vaswani et al [<xref ref-type="bibr" rid="ref30">30</xref>]. Here, Q, K, and V are the same, which are the feature combinations from the full sentence; therefore, multi-head attention can also be understood as a form of self-attention. Eight attention heads based on scaled dot-product attention were used to extract features, which divided feature combinations into 8 channels. For each channel, the embedding of each word in the sentence with length <italic>n</italic> can be expressed as <italic>z<sub>i</sub></italic>. Through the weights <italic>(W<sub>q</sub>, W<sub>k</sub>, W<sub>v</sub>)</italic> that are not shared between channels, we can get the vector expression of a word in different subspaces, namely <italic>(q<sub>i</sub>, k<sub>i</sub>, v<sub>i</sub>)</italic>, as shown in equation 4.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i10e41136_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>The attention weight vector <italic>a<sub>i</sub></italic> corresponding to the <italic>i</italic>-th query is calculated by the dot product of the query vector and key vector and then scaled by <inline-graphic xlink:href="medinform_v10i10e41136_fig10.png" xlink:type="simple" mimetype="image"/> and calculated by a Softmax function, where <italic>d<sub>k</sub></italic> is the dimensionality of the feature combination and <italic>n</italic> is the length of the sentence, as shown in equation 5.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i10e41136_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>By multiplying the attention weight vector <italic>a<sub>i</sub></italic> by the value sequence of length <italic>n</italic>, a feature vector <italic>c<sub>i</sub></italic> is obtained, which is a weighted sum of the values, as shown in equation 6.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i10e41136_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Therefore, the attention head of each channel is a concatenated matrix of <italic>n</italic> feature vectors, which can be expressed as <italic>h<sub>i</sub></italic> using equation 7. Each attention head can encode the semantic information of a sentence in subspaces with different representations.</p>
        <disp-formula><italic>h<sub>i</sub></italic> = [<italic>c<sub>1</sub>;c<sub>2</sub>;...;c<sub>n</sub></italic>] <bold>(7)</bold></disp-formula>
        <p>Furthermore, we concatenated multiple attention heads in the last dimension to obtain the multi-head attention feature of the full sentence, as shown in equation 8.</p>
        <disp-formula><italic>MultiHead</italic> = [<italic>h<sub>1</sub>;h<sub>2</sub>;...;h<sub>8</sub></italic>] <bold>(8)</bold></disp-formula>
        <p>Similar to the transformer model, we also used a fully connected neural network behind the multi-head attention model and used a residual join, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Finally, the global semantic features of the full sentence are obtained using a max-pooling operation.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Scaled dot-product attention function (left). Multi-head attention consists of several scaled dot-product attention (right). Concat: concatenate; K: key; Matmul: matrix multiply; Q: query; V: value.</p>
          </caption>
          <graphic xlink:href="medinform_v10i10e41136_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Syntactic Dependency Feature</title>
        <p>The syntactic dependency features for the proposed DL model are generated based on the shortest dependency path connecting 2 candidate entities and the dependency type in the dependency graph. The shortest dependency path contains the most important terms related to characterizing the extraction and has been successfully applied in relation extraction many times [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. An example of syntactic dependency is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, where “Enterococcus” is a <italic>microorganism-type</italic> entity and “Gram-positive” is a <italic>phenotype</italic>-type entity. We can observe that the dependency parse between the words is directional. To simplify the calculation, we use the method by Mehryary et al [<xref ref-type="bibr" rid="ref31">31</xref>] to convert the dependency relation of a sentence into an undirected graph and then find the shortest path between 2 candidate entities using the Dijkstra algorithm. In the case of BB-rel task, we always process from <italic>a microorganism</italic>-type entity to location entities (either a <italic>habitat</italic> or a <italic>geographic</italic> entity) or <italic>phenotype</italic> entity, regardless of their positions in sentences. Therefore, in the example in <xref rid="figure3" ref-type="fig">Figure 3</xref>, the shortest dependency path sequence is (“Enterococcus,” “cause,” “infection,” “Gram-positive”) and the dependency-type sequence is (nsubj, prep_of, amod).</p>
        <p>In this case, the sequence of the shortest dependency path with <italic>m</italic> tokens can be represented as <italic>{e<sub>1</sub>,w<sub>2</sub>,...,e<sub>2</sub>}</italic>, where <italic>e<sub>1</sub></italic> and <italic>e<sub>2</sub></italic> represent the entity pairs at the head and end of the sequence, respectively. We used the previously mentioned pretrained 200-dimensional biomedical word embedding model [<xref ref-type="bibr" rid="ref23">23</xref>]. Using the pretrained word embedding model, we can transform the dependent path sequence into a vector sequence <inline-graphic xlink:href="medinform_v10i10e41136_fig13.png" xlink:type="simple" mimetype="image"/>. For the dependent-type sequence <italic>{t<sub>1</sub>,t<sub>2</sub>,...,t<sub>m−1</sub>}</italic>, we transform it into <inline-graphic xlink:href="medinform_v10i10e41136_fig14.png" xlink:type="simple" mimetype="image"/> by randomly initializing the embedding matrix and filling it to the same length as the dependency path. The 2 vector sequences are concatenated, and <italic>i</italic>-th word can be denoted as <inline-graphic xlink:href="medinform_v10i10e41136_fig15.png" xlink:type="simple" mimetype="image"/>.</p>
        <p>To learn the local features of syntactic dependency from the dependency path and dependency type, LSTM networks [<xref ref-type="bibr" rid="ref53">53</xref>] are the most frequently used DL models. By observing the length of the shortest dependency path, it is found that most of the interentity dependency lengths are 2 to 5, which belongs to the feature extraction of super-short sequences. Compared with LSTM, CNN is more suitable for super-short and concise sequences (Yin, W, unpublished data, February 2017). In addition, CNNs are more suitable for parallel computing. Hence, we introduced a multifilter CNN model [<xref ref-type="bibr" rid="ref54">54</xref>] and a max-pooling operation to learn syntactic dependency features, which has the advantage of learning hidden and advanced features from sentences with multiple channels.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>An example of syntactic dependency between microorganism-type entity “Enterococcus” and phenotype-type entity “Gram-positive”; solid lines are entity dependencies, and dashed lines are irrelevant dependencies. advmod: adverbial modifier; amod: adjectival modifier; cop: copula; det: determiner; nsubj: nominal subject; prep_of: preposition of.</p>
          </caption>
          <graphic xlink:href="medinform_v10i10e41136_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Classification and Training</title>
        <p>In the output layer, we concatenate the global semantic feature vector and syntactic-dependent feature vector of the sentence to obtain a high-quality feature representation of the instance. Furthermore, the dropout algorithm [<xref ref-type="bibr" rid="ref55">55</xref>] is used to prevent overfitting, the Softmax function is used to classify biomedical relations, and the probability distribution over each relation category is obtained.</p>
        <p>The 2 tasks included a training set, validation set, and test set. In the training phase, taking the multi-classification cross entropy as the objective function, the Adaptive moment estimation optimization algorithm [<xref ref-type="bibr" rid="ref56">56</xref>] with a learning rate of 0.001 was used to update the neural network parameters. The training times determine the generalization performance of the model; that is, too few training epochs lead to underfitting, and overtraining leads to overfitting. Therefore, the traditional early stopping method is adopted in our method, that is, training is stopped when the performance on the validation set is no longer improved. The experimental results show that the training epoch number is not a fixed value and that the model generally converges in approximately 4 epochs.</p>
        <p>The data sets of the 2 biomedical relation extraction tasks were relatively small, and the DL model had more training parameters. Consequently, the initial random state of the model may have a significant impact on the final performance of the model, which was verified by a pre-experiment. To reduce the impact of the initialization state on the model, 10 different random initializations were used to evaluate the model, which was to train the same model structure with different random seeds. Finally, the model with the best <italic>F</italic><sub>1</sub> score on the validation set was used as the final model. We used the final model to predict the test set and used the results to evaluate our model on a web-based evaluation service.</p>
      </sec>
      <sec>
        <title>Parameter Settings</title>
        <p>Through the pre-experiment and evaluation based on the validation set, the hyperparameters of our model were determined. The dimensions of domain-specific word embedding, POS embedding, entity-type embedding, distance embedding, PE, and dependency-type embedding were 200, 200, 200, 100, 200, and 200, respectively, and the embedding matrix was fine-tuned during the training phase. For the multi-head attention mechanism, we adopted a single-layer multi-head attention model, in which 8 parallel attention heads were used, and the number of units in the linear layer of each attention head was the same as the input. To extract the syntactic dependency feature, the number of convolution layers was 1, the number of filters was set to 128, and the window sizes were 2, 3, and 4. In addition, the LSTM model was used in the experiment, and the output dimension of the hidden units was set as 128. For the combination of global semantic features and syntactic dependency features, the dropout rate was 0.5. The batch size was set to 8. Finally, we used the DL framework Pytorch [<xref ref-type="bibr" rid="ref57">57</xref>] to implement our model and carry out the experimental process.</p>
      </sec>
      <sec>
        <title>Ethics Approval</title>
        <p>The data set and methods used in this work are publicly available and do not involve any ethical or moral issues.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Set and Evaluation Metrics</title>
        <p>We conducted a series of experiments on the BB-rel and SeeDev-binary task data sets to evaluate our proposed approach.</p>
        <p>The BB-rel task in BioNLP-OST 2019 is quite different from the previous versions, which integrate the new entity type of <italic>phenotype</italic> and relation category of <italic>Exhibits</italic>. Therefore, this task involves 4 entity types, <italic>microorganism</italic>, <italic>habitat</italic>, <italic>geography</italic>, and <italic>phenotype</italic>, and 2 relation categories between entity pairs, <italic>Lives_In</italic> and <italic>Exhibits</italic>. In practice, the nonrelation between entity pairs is also regarded as a prediction category, so this task is treated as a multi-classification relation extraction task. In addition to intrasentence relations, the BB-rel task also considers intersentence relations, which remains a significant challenge. The proportion of intersentence relationships in the corpus was 17.5%. In our method, we consider only the intrasentence relationship. We adopted the method described in the data preprocessing section to segment the text into sentences, construct negative instances, and remove instances that do not comply with the constraint of regulation. In this manner, we constructed 1996 training instances, including 943 related instances; 1040 validation instances, including 517 related instances; and 1414 test instances. The detailed distribution of the BB-rel task data set after the preprocessing procedure is summarized in <xref ref-type="table" rid="table1">Table 1</xref>. Owing to different data revision and processing methods, the number of instances may be inconsistent with other studies.</p>
        <p>We used the predictions of the test set to evaluate our methods on the web-based evaluation service [<xref ref-type="bibr" rid="ref58">58</xref>]. Its evaluation metrics are similar to those of previous versions, including precision, recall, <italic>F</italic><sub>1</sub> score, and the results of the intrasentence and intersentence relations of various relation categories [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>The SeeDev-binary task corpus is a set of 87 paragraphs from 20 full articles on the seed development of <italic>Arabidopsis thaliana</italic>, with 17 entity types and 22 relation categories manually annotated by domain experts. There are 3575 annotated relations, including 1628 relations for the training sets, 819 relations for the validation sets, and 1128 relations for the test sets. We used the same method to preprocess the data set and eliminate intersentence relations. Then, 18,997 training instances were constructed, including 1508 related instances; 8955 validation instances were constructed, including 746 related instances; and 12,737 test instances were constructed, and the detailed distribution is shown in <xref ref-type="table" rid="table2">Table 2</xref>. It can be seen that there is an extreme imbalance where the number of nonrelation samples far exceeds the positive samples, which is more challenging and will negatively affect the performance of the model [<xref ref-type="bibr" rid="ref47">47</xref>]. Therefore, to alleviate this problem, through a series of pre-experiments, we finally decided to randomly delete 90% (15,740/17,489) of the negative samples in the training stage, but the validation and test sets were not reduced.</p>
        <p>The SeeDev-binary is also applicable to the web-based evaluation services. Compared with SeeDev-binary 2016, task organizers have added new evaluation metrics to emphasize biomedical contributions. The evaluation metrics are global results for all relations, the results of intrasentence relations, and type clusters, each of which has a precision, recall, and <italic>F</italic><sub>1</sub> score.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Detailed statistics of the relation extraction of Bacteria-Biotope task data set. The statistics of the test set are none because the organizer has not released the annotated relations on the test set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="330"/>
            <col width="260"/>
            <col width="260"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Training set</td>
                <td>Validation set</td>
                <td>Test set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Total</td>
                <td>1996</td>
                <td>1040</td>
                <td>1414</td>
              </tr>
              <tr valign="top">
                <td>Lives_in</td>
                <td>659</td>
                <td>377</td>
                <td>None</td>
              </tr>
              <tr valign="top">
                <td>Exhibits</td>
                <td>284</td>
                <td>140</td>
                <td>None</td>
              </tr>
              <tr valign="top">
                <td>Lives_in and Exhibits</td>
                <td>943</td>
                <td>517</td>
                <td>None</td>
              </tr>
              <tr valign="top">
                <td>Nonrelation</td>
                <td>1053</td>
                <td>523</td>
                <td>None</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Detailed statistics of the binary relation extraction of plant seed development task data set. The number of relationships in the test set is none because the number of relationships cannot be determined after preprocessing.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="270"/>
            <col width="290"/>
            <col width="290"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Training set</td>
                <td>Validation set</td>
                <td>Test set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Total</td>
                <td>18,997</td>
                <td>8955</td>
                <td>12,737</td>
              </tr>
              <tr valign="top">
                <td>All relation</td>
                <td>1508</td>
                <td>746</td>
                <td>None</td>
              </tr>
              <tr valign="top">
                <td>Nonrelation</td>
                <td>17,489</td>
                <td>8209</td>
                <td>None</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Experiment Results</title>
        <p>In the BB-rel task, we used the proposed DL model based on the multi-head attention mechanism and syntactic dependency feature to detect biomedical relations. Our proposed method finally obtained an <italic>F</italic><sub>1</sub> score of 65.56% on the test set; the details are shown in <xref ref-type="table" rid="table3">Table 3</xref>. Our method has <italic>F</italic><sub>1</sub> scores of 62.36% and 73.62% for the relation category of <italic>Lives_In</italic> and <italic>Exhibits</italic>, respectively, and performs better in the relation category <italic>Exhibits</italic>. Moreover, it can be noted that the <italic>F</italic><sub>1</sub> scores in the identification of intrasentence relations of Lives_In and Exhibits are 69.00% and 77.67%, which are higher than the comprehensive <italic>F</italic><sub>1</sub> score. This is because our preprocessing method only deals with intrasentence relations; therefore, it performs better in the identification of intrasentence relations.</p>
        <p><xref ref-type="table" rid="table4">Table 4</xref> lists the comparison between our method and other previous systems in BB-rel task. The first 3 lines in the table are the official top 3 systems (10 participated), among which Yuhang_Wu used a multilayer perceptron [<xref ref-type="bibr" rid="ref35">35</xref>], AliAI [<xref ref-type="bibr" rid="ref39">39</xref>] used a multitask architecture similar to BERT, and whunlp [<xref ref-type="bibr" rid="ref37">37</xref>] achieves state-of-the-art performance by using dependency graph and attention graph convolution neural network. The fourth line is the baseline provided by the task organizer, which uses a co-occurrence method. Owing to the huge difference between the model architecture of these systems, only the final <italic>F</italic><sub>1</sub> score is used for comparison. The <italic>F</italic><sub>1</sub> score of our method is 5.07% higher than the third-placed Yuhang_Wu and 0.60% superior to the second-placed AliAI, who achieved the result of 64.96%. It is worth noting that our model achieved the best precision of 69.50%, which is superior to all existing systems in BB-rel task. This result reveals that our method tends to predict fewer positive classes, that is, it performs better on false positives than other models. In conclusion, this comparison indicates that our proposed model is effective and achieved excellent performance in BB-rel task.</p>
        <p>In the SeeDev-binary task, our proposed method achieved an <italic>F</italic><sub>1</sub> score of 38.04% for all relations in the test set. The detailed results for the specific relation categories are shown in <xref ref-type="table" rid="table5">Table 5</xref>. As shown in the table, 7 types of relation categories were not detected, such as <italic>Is_Involved_In_Process</italic> and <italic>Occurs_During</italic>. Through the statistical analysis of the data set, it was found that there were few positive instances of these relation categories in the training set, which was obviously responsible for the uneven classification.</p>
        <p><xref ref-type="table" rid="table6">Table 6</xref> lists the results of comparison between our method and other systems for the SeeDev-binary task. The first 2 systems are the top 2 of the official ranks in BioNLP-OST 2019. Among them, the first-placed MIC-CIS [<xref ref-type="bibr" rid="ref42">42</xref>] used linguistic features and an SVM classifier to achieve an <italic>F</italic><sub>1</sub> score of 37.38%, whereas YNU-junyi [<xref ref-type="bibr" rid="ref14">14</xref>], the second-ranking system, obtained an <italic>F</italic><sub>1</sub> score of 34.18% using a DL model combined with distributed representation, CNN, and LSTM models. The results show that our method achieves the state-of-the-art performance in both the all-relation and intrasentence-relation categories, with <italic>F</italic><sub>1</sub> scores of 38.04% and 38.68%, respectively. In the all-relation category, the <italic>F</italic><sub>1</sub> score of our system outperformed the first-ranking system by 0.66% and the second-ranking system by 3.86%. Meanwhile, the result is similar to BB-rel task; our system performed excellently in precision. In the all-relation and intrasentence-relation categories, the precision surpassed the first-ranking system by 7.30% and 5.30%, respectively. This once again proves that our model has a lower false-positive rate than other models. Therefore, we can conclude that our model can take advantage of both the multi-head attention mechanism and syntactic dependency feature to achieve excellent performance in biomedical relation extraction tasks.</p>
        <p>The results by cluster are also important evaluation metrics in the SeeDev-binary task, and the comparison of <italic>F</italic><sub>1</sub> scores is shown in <xref ref-type="table" rid="table7">Table 7</xref>. It can be seen from the table that our model achieves optimal results in 3 cluster categories: <italic>function</italic>, <italic>regulation</italic>, and <italic>genic regulation,</italic> and it performs poorly in 2 cluster categories: <italic>composition membership</italic> and <italic>interaction</italic>, but the overall performance of our proposed model is generally satisfactory.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Detailed results of our method on the test set of relation extraction of Bacteria-Biotope task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="470"/>
            <col width="200"/>
            <col width="140"/>
            <col width="190"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Lives_In and Exhibits</td>
                <td>69.50</td>
                <td>62.05</td>
                <td>
                  <italic>65.56</italic>
                  <sup>a</sup>
                </td>
              </tr>
              <tr valign="top">
                <td>Lives_In</td>
                <td>69.38</td>
                <td>56.64</td>
                <td>62.36</td>
              </tr>
              <tr valign="top">
                <td>Lives_In (intrasentence)</td>
                <td>69.75</td>
                <td>68.27</td>
                <td>69.00</td>
              </tr>
              <tr valign="top">
                <td>Exhibits</td>
                <td>69.77</td>
                <td>77.92</td>
                <td>73.62</td>
              </tr>
              <tr valign="top">
                <td>Exhibits (intrasentence)</td>
                <td>70.18</td>
                <td>86.96</td>
                <td>77.67</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>The final <italic>F</italic><sub>1</sub> score is shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparison of results between our method and other systems for the relation extraction of Bacteria-Biotope task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="380"/>
            <col width="230"/>
            <col width="180"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td>Models</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>whunlp [<xref ref-type="bibr" rid="ref37">37</xref>]</td>
                <td>62.94</td>
                <td>
                  <italic>70.22</italic>
                  <sup>a</sup>
                </td>
                <td>
                  <italic>66.38</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>AliAI [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
                <td>68.20</td>
                <td>62.01</td>
                <td>64.96</td>
              </tr>
              <tr valign="top">
                <td>Yuhang_Wu [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>55.10</td>
                <td>67.03</td>
                <td>60.49</td>
              </tr>
              <tr valign="top">
                <td>Baseline [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>52.54</td>
                <td>80.13</td>
                <td>63.47</td>
              </tr>
              <tr valign="top">
                <td>Our model</td>
                <td>
                  <italic>69.50</italic>
                </td>
                <td>62.05</td>
                <td>65.56</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>The maximum results are shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Detailed results of our method on the test set of the binary relation extraction of plant seed development task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="520"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Binary relation type</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Exists_In_Genotype</td>
                <td>40.59</td>
                <td>32.28</td>
                <td>35.96</td>
              </tr>
              <tr valign="top">
                <td>Occurs_In_Genotype</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Exists_At_Stage</td>
                <td>50.00</td>
                <td>10.00</td>
                <td>16.67</td>
              </tr>
              <tr valign="top">
                <td>Occurs_During</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Is_Localized_In</td>
                <td>38.16</td>
                <td>46.77</td>
                <td>42.03</td>
              </tr>
              <tr valign="top">
                <td>Is_Involved_In_Process</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Transcribes_Or_Translates_To</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Is_Functionally_Equivalent_To</td>
                <td>60.94</td>
                <td>55.71</td>
                <td>58.21</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Accumulation</td>
                <td>66.67</td>
                <td>25.00</td>
                <td>36.36</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Development_Phase</td>
                <td>22.86</td>
                <td>41.56</td>
                <td>29.49</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Expression</td>
                <td>24.65</td>
                <td>50.72</td>
                <td>33.18</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Molecule_Activity</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Process</td>
                <td>40.04</td>
                <td>64.71</td>
                <td>49.47</td>
              </tr>
              <tr valign="top">
                <td>Regulates_Tissue_Development</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Composes_Primary_Structure</td>
                <td>60.00</td>
                <td>37.50</td>
                <td>46.15</td>
              </tr>
              <tr valign="top">
                <td>Composes_Protein_Complex</td>
                <td>50.00</td>
                <td>66.67</td>
                <td>57.14</td>
              </tr>
              <tr valign="top">
                <td>Is_Protein_Domain_Of</td>
                <td>26.09</td>
                <td>19.35</td>
                <td>22.22</td>
              </tr>
              <tr valign="top">
                <td>Is_Member_Of_Family</td>
                <td>27.78</td>
                <td>52.33</td>
                <td>36.29</td>
              </tr>
              <tr valign="top">
                <td>Has_Sequence_Identical_To</td>
                <td>100.00</td>
                <td>47.73</td>
                <td>64.62</td>
              </tr>
              <tr valign="top">
                <td>Interacts_With</td>
                <td>80.00</td>
                <td>14.81</td>
                <td>25.00</td>
              </tr>
              <tr valign="top">
                <td>Binds_To</td>
                <td>30.77</td>
                <td>12.50</td>
                <td>17.78</td>
              </tr>
              <tr valign="top">
                <td>Is_Linked_To</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>All relations</td>
                <td>34.75</td>
                <td>42.02</td>
                <td>
                  <italic>38.04</italic>
                  <sup>a</sup>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>The final <italic>F</italic><sub>1</sub> score is shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Comparison of results between our method and other systems for the binary relation extraction of plant seed development task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>Models</td>
                <td colspan="4">All relation</td>
                <td colspan="3">Intrasentence relation</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub> score</td>
                <td colspan="2">Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MIC-CIS [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
                <td>27.45</td>
                <td>
                  <italic>51.15</italic>
                  <sup>a</sup>
                </td>
                <td>37.38</td>
                <td colspan="2">29.45</td>
                <td>
                  <italic>53.08</italic>
                </td>
                <td>37.88</td>
              </tr>
              <tr valign="top">
                <td>YNU-junyi [<xref ref-type="bibr" rid="ref14">14</xref>]</td>
                <td>27.25</td>
                <td>45.83</td>
                <td>34.18</td>
                <td colspan="2">27.25</td>
                <td>47.56</td>
                <td>34.65</td>
              </tr>
              <tr valign="top">
                <td>Our method</td>
                <td>
                  <italic>34.75</italic>
                </td>
                <td>42.02</td>
                <td>
                  <italic>38.04</italic>
                </td>
                <td colspan="2">
                  <italic>34.75</italic>
                </td>
                <td>43.61</td>
                <td>
                  <italic>38.68</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>The maximum results are shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Comparison of <italic>F</italic><sub>1</sub> scores by cluster between our method and other systems for the binary relation extraction of plant seed development task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="100"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="140"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Models</td>
                <td>All</td>
                <td>Comparison</td>
                <td>Function</td>
                <td>Regulation</td>
                <td>Genic regulation</td>
                <td>Composition membership</td>
                <td>Interaction</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MIC-CIS [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
                <td>37.38</td>
                <td>47.92</td>
                <td>17.39</td>
                <td>34.78</td>
                <td>33.84</td>
                <td>
                  <italic>40.25</italic>
                  <sup>a</sup>
                </td>
                <td>
                  <italic>34.24</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>YNU-junyi [<xref ref-type="bibr" rid="ref14">14</xref>]</td>
                <td>34.18</td>
                <td>
                  <italic>50.45</italic>
                </td>
                <td>25.00</td>
                <td>34.21</td>
                <td>23.00</td>
                <td>34.68</td>
                <td>21.87</td>
              </tr>
              <tr valign="top">
                <td>Our method</td>
                <td>
                  <italic>38.04</italic>
                </td>
                <td>49.68</td>
                <td>
                  <italic>25.53</italic>
                </td>
                <td>
                  <italic>40.78</italic>
                </td>
                <td>
                  <italic>34.04</italic>
                </td>
                <td>32.72</td>
                <td>22.02</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>The maximum results are shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we conducted ablation experiments to analyze the effectiveness of multi-head attention mechanism and syntactic dependency feature. To avoid the instability of a single model, the mean <italic>F</italic><sub>1</sub> score on the test set was used to measure model performance. Subsequently, we conducted an error analysis and manually analyzed the correct and incorrect predictions.</p>
      </sec>
      <sec>
        <title>Effectiveness of Multi-Head Attention Mechanism</title>
        <p>We first analyzed the effectiveness of the multi-head attention mechanism in the global semantic feature extraction of a full sentence compared with the traditional CNN, BiLSTM, and additive attention models [<xref ref-type="bibr" rid="ref26">26</xref>]. All models use the distributed features and syntactic dependency features that we use, such as domain-specific word embedding. Owing to the application of PE in the multi-head attention mechanism, we integrate PE into all models for a fair comparison. <xref ref-type="table" rid="table8">Table 8</xref> shows a comparison of the mean <italic>F</italic><sub>1</sub> scores using various models to encode global semantic features.</p>
        <p>From the table, the first 2 lines are the results of extracting the feature representation of sentences using the CNN or BiLSTM model alone, among which the result of the BiLSTM model was slightly better. A possible explanation is that the length of sentences in instances is generally large, and the CNN model can only process window information and rely on a pooling operation to summarize the overall structure of the sentences. However, the BiLSTM model is more suitable for sequence modeling and encoding longer sequence information using a bidirectional memory network. They were then combined with an additive attention model. Compared with the CNN and BiLSTM models alone, the application of the attention model improved <italic>F</italic><sub>1</sub> scores by 1.82% and 1.22% on BB-rel and 1.31% and 1.11% on SeeDev-binary, respectively. In addition, the performance of CNN with attention exceeds that of BiLSTM with attention on the BB-rel task, possibly because the attention mechanism fills the shortcoming that CNN cannot capture the long-range dependence of sentences. Hence, these results suggest that the attention mechanism can effectively improve the performance of the model by focusing on the key information of the token sequence and learning the overall structure of a sentence.</p>
        <p>Finally, the multi-head attention mechanism is introduced into our model without any CNN or recurrent neural network structure, and the optimal result is achieved. The mean <italic>F</italic><sub>1</sub> score was 63.13% and 36.37% for the 2 tasks, which are 1.11% and 1.24% higher than that of the BiLSTM-attention model and 0.96% and 1.45% higher than that of the CNN-attention model, respectively. The results show that the multi-head attention mechanism significantly outperforms the additive attention model in biomedical relation extraction. To some extent, additive attention can be understood as a single-head attention model that can only learn the global semantic features in one representation space. However, the advantage of the multi-head attention mechanism is that it captures the global semantic information in different representation subspaces and integrates the contextual information of relevant words into the current word from multiple channels. The experimental results demonstrate that the multi-head attention mechanism can extract more comprehensive feature representations and effectively improve the performance of the relation extraction model.</p>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>The comparison of mean <italic>F</italic><sub>1</sub> score of using different models to extract global semantic features in the relation extraction of Bacteria-Biotope task (BB-rel) and the binary relation extraction of plant seed development task (SeeDev-binary).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Global semantic features</td>
                <td colspan="3">BB-rel</td>
                <td colspan="3">SeeDev-binary</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Minimum<sup>a</sup></td>
                <td>Maximum<sup>b</sup></td>
                <td>Mean (SD)</td>
                <td>Minimum<sup>a</sup></td>
                <td>Maximum<sup>b</sup></td>
                <td>Mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CNN<sup>c</sup></td>
                <td>57.26</td>
                <td>63.26</td>
                <td>60.35 (2.11)</td>
                <td>31.67</td>
                <td>35.85</td>
                <td>33.61 (1.33)</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM<sup>d</sup></td>
                <td>57.89</td>
                <td>63.80</td>
                <td>60.80 (1.88)</td>
                <td>32.39</td>
                <td>36.28</td>
                <td>34.02 (1.53)</td>
              </tr>
              <tr valign="top">
                <td>CNN-attention</td>
                <td>59.69</td>
                <td>65.01</td>
                <td>62.17 (1.69)</td>
                <td>32.89</td>
                <td>37.52</td>
                <td>34.92 (1.47)</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM-attention</td>
                <td>59.80</td>
                <td>64.38</td>
                <td>62.02 (1.45)</td>
                <td>33.61</td>
                <td>37.30</td>
                <td>35.13 (1.18)</td>
              </tr>
              <tr valign="top">
                <td>Multi-head attention</td>
                <td>
                  <italic>60.68</italic>
                  <sup>e</sup>
                </td>
                <td>
                  <italic>65.56</italic>
                </td>
                <td>
                  <italic>63.13 (</italic>
                  <italic>1.55</italic>
                  <italic>)</italic>
                </td>
                <td>
                  <italic>34.47</italic>
                </td>
                <td>
                  <italic>38.04</italic>
                </td>
                <td>
                  <italic>36.37 (1.13)</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>The lowest <italic>F</italic><sub>1</sub>-scores of 10 different random initializations.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>The highest <italic>F</italic><sub>1</sub>-scores of 10 different random initializations.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table8fn4">
              <p><sup>d</sup>BiLSTM: bidirectional long short-term memory network.</p>
            </fn>
            <fn id="table8fn5">
              <p><sup>e</sup>The maximum results are shown in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Effectiveness of Syntactic Dependency Feature</title>
        <p>Furthermore, we analyzed the effectiveness of the syntactic dependency feature in our model. The length of the shortest dependency paths, based on syntactic analysis, is mostly 2 to 5, which belongs to a super-short sequence. Therefore, we only tried to use the CNN and BiLSTM models for feature extraction, and the results are shown in <xref ref-type="table" rid="table9">Table 9</xref>. The first line shows the results that the model does not use syntactic dependency features, and the average <italic>F</italic><sub>1</sub> scores were 60.85% and 34.60% for BB-rel and SeeDev-binary tasks, respectively. When the LSTM model was used to extract syntactic dependency features, the mean <italic>F</italic><sub>1</sub> scores of the model were 62.88% and 36.06%. When we used the CNN model, the performance of the model reached optimal <italic>F</italic><sub>1</sub> scores, which improved to 63.13% and 36.37% on BB-rel and SeeDev-binary tasks, respectively. The results also show that the CNN model is superior to LSTM in terms of feature extraction for super-short sequences. By comparison, it can be demonstrated that the integration of syntactic dependency features can enable the model to learn syntactic information between entity pairs through a dependency graph, which can effectively improve the performance of the model.</p>
        <table-wrap position="float" id="table9">
          <label>Table 9</label>
          <caption>
            <p>The comparison of mean <italic>F</italic><sub>1</sub> scores of using different models to extract syntactic dependency features in the relation extraction of Bacteria-Biotope task (BB-rel) and the binary relation extraction of plant seed development task (SeeDev-binary).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Syntactic dependency feature</td>
                <td colspan="3">BB-rel</td>
                <td colspan="3">SeeDev-binary</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Minimum<sup>a</sup></td>
                <td>Maximum<sup>b</sup></td>
                <td>Mean (SD)</td>
                <td>Minimum<sup>a</sup></td>
                <td>Maximum<sup>b</sup></td>
                <td>Mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>No-use</td>
                <td>58.51</td>
                <td>63.70</td>
                <td>60.85 (1.65)</td>
                <td>32.89</td>
                <td>36.53</td>
                <td>34.60 (1.16)</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>c</sup></td>
                <td>59.93</td>
                <td>65.16</td>
                <td>62.88 (1.66)</td>
                <td>
                  <italic>34.55</italic>
                  <sup>d</sup>
                </td>
                <td>37.90</td>
                <td>36.06 (1.07)</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>e</sup></td>
                <td>
                  <italic>60.68</italic>
                </td>
                <td>
                  <italic>65.56</italic>
                </td>
                <td>
                  <italic>63.13 (1.55)</italic>
                </td>
                <td>34.47</td>
                <td>
                  <italic>38.04</italic>
                </td>
                <td>
                  <italic>36.37 (1.13)</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table9fn1">
              <p><sup>a</sup>The lowest <italic>F</italic><sub>1</sub>-scores of 10 different random initializations.</p>
            </fn>
            <fn id="table9fn2">
              <p><sup>b</sup>The highest <italic>F</italic><sub>1</sub>-scores of 10 different random initializations.</p>
            </fn>
            <fn id="table9fn3">
              <p><sup>c</sup>LSTM: long short-term memory network.</p>
            </fn>
            <fn id="table9fn4">
              <p><sup>d</sup>The maximum results are shown in italics.</p>
            </fn>
            <fn id="table9fn5">
              <p><sup>e</sup>CNN: convolutional neural network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>To verify the advantages and weaknesses of our proposed model, we compared the experimental results with those of other existing models. We find that our system performs better in terms of the precision of the 2 relation extraction tasks, far surpassing other models, which means that our approach has a lower false-positive rate than the other models. One possible explanation is that our model structure introduces the shortest dependent paths compared with other systems, which can more definitely identify the biomedical relationship between entity pairs.</p>
        <p>The 2 relationship extraction tasks are constrained under regulations; therefore, it is necessary to investigate whether there is a situation in which the predicted relationship does not conform to the rules. For example, in the sentence “An evaluation of selective broths based on the bi-selenite ion and on hypertonic strontium chloride in <italic>Salmonellae</italic> detection in egg products,” the entity “Salmonellae” is of <italic>microorganism</italic> type, and the entity “egg products” is of <italic>habitat</italic> type. There may be a <italic>Lives_In</italic> relationship between them, but if it is predicted as an <italic>Exhibits</italic> relationship, it must be wrong. Through an analysis of the prediction results on the validation set, it was found that this situation rarely occurs. Therefore, our research should focus on whether a biomedical relationship exists between entity pairs.</p>
        <p>In addition, we manually analyzed the correct and false predictions from the validation set compared with existing DL models (structures similar to YNU-junyi [<xref ref-type="bibr" rid="ref14">14</xref>]). We found that our proposed model generally performed better on long sentences. A complicated sentence structure and long distance between 2 entities are more likely to lead to relationship classification errors. For example, in the sentence “The prevalence of H. pylori infection in dyspeptic patients in Yemen is very high, the eradication rate with standard triple therapy was unsatisfactory probably because of widespread bacterial resistance due to unrestricted antibiotic use,” “H. pylori” is a <italic>microorganism</italic> entity, “widespread bacterial resistance due to unrestricted antibiotic use” is a <italic>phenotypic</italic> entity, and there is an <italic>Exhibits</italic> relationship between them. The DL model, similar to YNU-junyi, predicted it as a nonrelationship category, but our model can better detect it, probably because our proposed model can capture the long-term dependency between words in a long sentence.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This paper focuses on the 2 relation extraction tasks in BioNLP-OST 2019: BB-rel task and SeeDev-binary task, which aim to promote the development of fine-grained IE from biomedical texts. For these tasks, we propose a DL model based on the combination of a series of distributed features to detect relations, introduce a multi-head attention mechanism to extract global semantic features, and use syntactic-dependent features to enrich the feature representation. Our proposed method obtained <italic>F</italic><sub>1</sub> scores of 65.56% and 38.04% on the test sets of the 2 tasks and achieved state-of-the-art results in the SeeDev-binary task. Through ablation experiments, the effectiveness of multi-head attention and syntactic dependency features was demonstrated. The multi-head attention mechanism allows the model to learn relevant semantic information in different representation subspaces at different positions and integrates the contextual information of relevant words in the sentence into the current word representation, which greatly improves the performance of the biomedical relation extraction model.</p>
        <p>Despite the excellent performance of our model on BB-rel and SeeDev-binary tasks, there are still many challenges. In particular, the intersentence relation is not considered in our method, which remains a difficult problem in biomedical relation extraction tasks. This situation is because of the complexity of the reasoning relationship and the extreme imbalance between the positive and negative examples. In contrast, the use of a DL model to extract high-quality features from small training data sets is a problem that needs to be solved. In future work, we will consider using a semisupervised learning method or transformer model, such as BERT, to better solve the topic of biomedical relation extraction.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BB-rel</term>
          <def>
            <p>relation extraction of Bacteria-Biotope task</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BioNLP-OST</term>
          <def>
            <p>Biomedical Natural Language Processing Workshop-Open Shared Task</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IE</term>
          <def>
            <p>information extraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">PE</term>
          <def>
            <p>position embedding</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">POS</term>
          <def>
            <p>part of speech</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SeeDev-binary</term>
          <def>
            <p>binary relation extraction of plant seed development task</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">TEES</term>
          <def>
            <p>Turku Event Extraction System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by the Youth Science and Technology Talent Growth Project of the general university in Guizhou Province (黔教合KY字 [2022] 281号), the Zunyi Science and Technology Cooperation Fund (遵市科合HZ字 [2020] 81号), and the Guizhou Science and Technology Cooperation Platform Talent Fund (黔科合平台人才 [2018] 5772-088, 黔科合平台人才 [2019]-020).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mooney</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bunescu</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Mining knowledge from text using information extraction</article-title>
          <source>SIGKDD Explor Newsl</source>
          <year>2005</year>
          <month>06</month>
          <day>01</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1145/1089815.1089817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krallinger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Erhardt</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Valencia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Text-mining approaches in molecular biology and biomedicine</article-title>
          <source>Drug Discov Today</source>
          <year>2005</year>
          <month>03</month>
          <day>15</day>
          <volume>10</volume>
          <issue>6</issue>
          <fpage>439</fpage>
          <lpage>45</lpage>
          <pub-id pub-id-type="doi">10.1016/S1359-6446(05)03376-3</pub-id>
          <pub-id pub-id-type="medline">15808823</pub-id>
          <pub-id pub-id-type="pii">S1359644605033763</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>KB</given-names>
            </name>
          </person-group>
          <article-title>Frontiers of biomedical text mining: current progress</article-title>
          <source>Brief Bioinform</source>
          <year>2007</year>
          <month>09</month>
          <volume>8</volume>
          <issue>5</issue>
          <fpage>358</fpage>
          <lpage>75</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/17977867"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bib/bbm045</pub-id>
          <pub-id pub-id-type="medline">17977867</pub-id>
          <pub-id pub-id-type="pii">bbm045</pub-id>
          <pub-id pub-id-type="pmcid">PMC2516302</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blaschke</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Andrade</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Ouzounis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Valencia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Automatic extraction of biological information from scientific text: protein-protein interactions</article-title>
          <source>Proc Int Conf Intell Syst Mol Biol</source>
          <year>1999</year>
          <fpage>60</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="medline">10786287</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Segura-Bedmar</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Martínez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>de Pablo-Sánchez</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Extracting drug-drug interactions from biomedical texts</article-title>
          <source>BMC Bioinformatics</source>
          <year>2010</year>
          <month>10</month>
          <day>06</day>
          <volume>11</volume>
          <issue>S5</issue>
          <fpage>P9</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(11)00069-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-11-s5-p9</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(11)00069-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nédellec</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bossy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Proceedings of the 4th BioNLP Shared Task Workshop</article-title>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w16-30</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <source>BioNLP Open Shared Tasks 2019</source>
          <access-date>2022-09-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://2019.bionlp-ost.org/home">https://2019.bionlp-ost.org/home</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kambhatla</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Combining lexical, syntactic, and semantic features with maximum entropy models for extracting relations</article-title>
          <source>Proceedings of the ACL 2004 on Interactive poster and demonstration sessions</source>
          <year>2004</year>
          <conf-name>ACLdemo '04</conf-name>
          <conf-date>July 21-26, 2004</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <fpage>22</fpage>
          <lpage>es</lpage>
          <pub-id pub-id-type="doi">10.3115/1219044.1219066</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Grishman</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Employing word representations and regularization for domain adaptation of relation extraction</article-title>
          <source>Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</source>
          <year>2014</year>
          <conf-name>ACL '14</conf-name>
          <conf-date>June 22-27, 2014</conf-date>
          <conf-loc>Baltimore, MD, USA</conf-loc>
          <fpage>68</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/p14-2012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TV</given-names>
            </name>
            <name name-style="western">
              <surname>Moschitti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Riccardi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Convolution kernels on constituent, dependency and sequential structures for relation extraction</article-title>
          <source>Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2009</year>
          <month>08</month>
          <conf-name>EMNLP '09</conf-name>
          <conf-date>August 6-7, 2009</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <fpage>1378</fpage>
          <lpage>87</lpage>
          <pub-id pub-id-type="doi">10.3115/1699648.1699684</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A feature-enriched tree kernel for relation extraction</article-title>
          <source>Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</source>
          <year>2014</year>
          <conf-name>ACL '14</conf-name>
          <conf-date>June 22-27, 2014</conf-date>
          <conf-loc>Baltimore, MD, USA</conf-loc>
          <fpage>61</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/p14-2011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Björne</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Salakoski</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>TEES 2.1: Automated annotation scheme learning in the BioNLP 2013 Shared Task</article-title>
          <source>Proceedings of the BioNLP Shared Task 2013 Workshop</source>
          <year>2013</year>
          <conf-name>BioNLP '13</conf-name>
          <conf-date>August 9, 2013</conf-date>
          <conf-loc>Sofia, Bulgaria</conf-loc>
          <fpage>16</fpage>
          <lpage>25</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W13-2003.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>DUTIR in BioNLP-ST 2016: utilizing convolutional network and distributed representation to extract complicate relations</article-title>
          <source>Proceedings of the 4th BioNLP shared task workshop</source>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>93</fpage>
          <lpage>100</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W16-3012.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>YNU-junyi in BioNLP-OST 2019: Using CNN-LSTM Model with Embeddings for SeeDev Binary Event Extraction</article-title>
          <source>Proceedings of The 5th Workshop on BioNLP Open Shared Tasks</source>
          <year>2019</year>
          <conf-name>BioNLP '19</conf-name>
          <conf-date>November 4, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>110</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/D19-5717</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>LeCun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Arbib</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Convolutional networks for images, speech, and time series</article-title>
          <source>The Handbook of Brain Theory and Neural Networks</source>
          <year>1998</year>
          <month>10</month>
          <publisher-loc>Cambridge, MA, USA</publisher-loc>
          <publisher-name>MIT Press</publisher-name>
          <fpage>255</fpage>
          <lpage>8</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Drug-drug interaction extraction via convolutional neural networks</article-title>
          <source>Comput Math Methods Med</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>6918381</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2016/6918381"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2016/6918381</pub-id>
          <pub-id pub-id-type="medline">26941831</pub-id>
          <pub-id pub-id-type="pmcid">PMC4752975</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Relation classification via convolutional deep neural network</article-title>
          <source>Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers</source>
          <year>2014</year>
          <conf-name>COLING '14</conf-name>
          <conf-date>August 23-29, 2014</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>2335</fpage>
          <lpage>44</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/C14-1220.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory</article-title>
          <source>Neural Comput</source>
          <year>1997</year>
          <month>11</month>
          <day>15</day>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
          <pub-id pub-id-type="medline">9377276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Drug-drug interaction extraction via hierarchical RNNs on sequence and shortest dependency paths</article-title>
          <source>Bioinformatics</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>828</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29077847"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btx659</pub-id>
          <pub-id pub-id-type="medline">29077847</pub-id>
          <pub-id pub-id-type="pii">4565590</pub-id>
          <pub-id pub-id-type="pmcid">PMC6030919</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Anand</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Drug-drug interaction extraction from biomedical texts using long short-term memory network</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>10</month>
          <volume>86</volume>
          <fpage>15</fpage>
          <lpage>24</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30160-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.08.005</pub-id>
          <pub-id pub-id-type="medline">30142385</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30160-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vu</surname>
              <given-names>NT</given-names>
            </name>
            <name name-style="western">
              <surname>Adel</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Combining recurrent and convolutional neural networks for relation classification</article-title>
          <source>Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2016</year>
          <month>6</month>
          <conf-name>NAACL '16</conf-name>
          <conf-date>June 12-17, 2016</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <fpage>534</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N16-1065/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Joint entity and relation extraction based on a hybrid neural network</article-title>
          <source>Neurocomputing</source>
          <year>2017</year>
          <month>09</month>
          <volume>257</volume>
          <fpage>59</fpage>
          <lpage>66</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2016.12.075</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ginter</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Moen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Salakoski</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Distributional semantics resources for biomedical text processing</article-title>
          <source>Proceedings of the 5th International Symposium on Languages in Biology and Medicine</source>
          <year>2013</year>
          <conf-name>LBM '13</conf-name>
          <conf-date>December 12-13, 2013</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
          <fpage>39</fpage>
          <lpage>44</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bio.nlplab.org/pdf/pyysalo13literature.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pasupa</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Seneewong Na Ayutthaya</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Thai sentiment analysis with deep learning techniques: a comparative study based on word embedding, POS-tag, and sentic features</article-title>
          <source>Sustain Cities Soc</source>
          <year>2019</year>
          <month>10</month>
          <volume>50</volume>
          <fpage>101615</fpage>
          <pub-id pub-id-type="doi">10.1016/j.scs.2019.101615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cormode</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Sequence distance embeddings</article-title>
          <source>Department of Computer Science, The University of Warwick</source>
          <year>2003</year>
          <month>01</month>
          <access-date>2022-09-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.dcs.warwick.ac.uk/report/pdfs/cs-rr-393.pdf">https://www.dcs.warwick.ac.uk/report/pdfs/cs-rr-393.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural machine translation by jointly learning to align and translate</article-title>
          <source>Proceedings of the 3rd International Conference on Learning Representations</source>
          <year>2015</year>
          <conf-name>ICLR '15</conf-name>
          <conf-date>May 7-9, 2015</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1409.0473"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Effective approaches to attention-based neural machine translation</article-title>
          <source>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2015</year>
          <month>09</month>
          <conf-name>EMNLP '15</conf-name>
          <conf-date>September 17-21, 2015</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <fpage>1412</fpage>
          <lpage>21</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D15-1166/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Dohan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Norouzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
          </person-group>
          <article-title>QANet: combining local convolution with global self-attention for reading comprehension</article-title>
          <source>Proceedings of the 6th International Conference on Learning Representations</source>
          <year>2018</year>
          <conf-name>ICLR '18</conf-name>
          <conf-date>April 30-May 3, 2018</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/forum?id=B14TlG-RW"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Attention-based LSTM for aspect-level sentiment classification</article-title>
          <source>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>EMNLP '16</conf-name>
          <conf-date>November 1-5, 2016</conf-date>
          <conf-loc>Austin, TX, USA</conf-loc>
          <fpage>606</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D16-1058.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>Proceedings of the 2017 Advances in Neural Information Processing Systems</source>
          <year>2017</year>
          <conf-name>NeurIPS '17</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <fpage>5998</fpage>
          <lpage>6008</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mehryary</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Björne</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Salakoski</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ginter</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Deep learning with minimal training data: TurkuNLP entry in the BioNLP shared task 2016</article-title>
          <source>Proceedings of the 4th BioNLP Shared Task Workshop</source>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>73</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W16-3009.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/W16-3009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A shortest dependency path based convolutional neural network for protein-protein relation extraction</article-title>
          <source>Biomed Res Int</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>8479587</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2016/8479587"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2016/8479587</pub-id>
          <pub-id pub-id-type="medline">27493967</pub-id>
          <pub-id pub-id-type="pmcid">PMC4963603</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jettakul</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wichadakul</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vateekul</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Relation extraction between bacteria and biotopes from biomedical texts with attention mechanisms and domain-specific contextual representations</article-title>
          <source>BMC Bioinformatics</source>
          <year>2019</year>
          <month>12</month>
          <day>03</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>627</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3217-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-019-3217-3</pub-id>
          <pub-id pub-id-type="medline">31795930</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-019-3217-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6889521</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deléger</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bossy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chaix</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ferré</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bessières</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Nédellec</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Overview of the bacteria biotope task at BioNLP shared task 2016</article-title>
          <source>Proceedings of the 4th BioNLP Shared Task Workshop</source>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>12</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/W16-3002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bossy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Deléger</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chaix</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nédellec</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Bacteria biotope at BioNLP open shared tasks 2019</article-title>
          <source>Proceedings of The 5th Workshop on BioNLP Open Shared Tasks</source>
          <year>2019</year>
          <conf-name>BioNLP '19</conf-name>
          <conf-date>November 4, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>121</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/D19-5719</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Biomedical event extraction based on GRU integrating attention mechanism</article-title>
          <source>BMC Bioinformatics</source>
          <year>2018</year>
          <month>08</month>
          <day>13</day>
          <volume>19</volume>
          <issue>Suppl 9</issue>
          <fpage>285</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2275-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-018-2275-2</pub-id>
          <pub-id pub-id-type="medline">30367569</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-018-2275-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC6101075</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Bacteria biotope relation extraction via lexical chains and dependency graphs</article-title>
          <source>Proceedings of The 5th Workshop on BioNLP Open Shared Tasks</source>
          <year>2019</year>
          <conf-name>BioNLP '19</conf-name>
          <conf-date>November 4, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>158</fpage>
          <lpage>67</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/D19-5723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paliwal</surname>
              <given-names>KK</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional recurrent neural networks</article-title>
          <source>IEEE Trans Signal Process</source>
          <year>1997</year>
          <month>11</month>
          <volume>45</volume>
          <issue>11</issue>
          <fpage>2673</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.1109/78.650093</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A multi-task learning framework for extracting bacteria biotope information</article-title>
          <source>Proceedings of The 5th Workshop on BioNLP Open Shared Tasks</source>
          <year>2019</year>
          <conf-name>BioNLP '19</conf-name>
          <conf-date>November 4, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>105</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D19-5716/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D19-5716</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chaix</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dubreucq</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Fatihi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Valsamou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bossy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deléger</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bessières</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lepiniec</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nédellec</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Overview of the regulatory network of plant seed development (SeeDev) task at the BioNLP shared task 2016</article-title>
          <source>Proceedings of the 4th BioNLP Shared Task Workshop</source>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>1</fpage>
          <lpage>11</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/W16-3001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Panyam</surname>
              <given-names>NC</given-names>
            </name>
            <name name-style="western">
              <surname>Khirbat</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Verspoor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ramamohanarao</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>SeeDev binary event extraction using SVMs and a rich feature set</article-title>
          <source>Proceedings of the 4th BioNLP Shared Task Workshop</source>
          <year>2016</year>
          <conf-name>BioNLP '16</conf-name>
          <conf-date>August 13, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>82</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/W16-3010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yaseen</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Linguistically informed relation extraction and neural architectures for nested named entity recognition in BioNLP-OST 2019</article-title>
          <source>Proceedings of The 5th Workshop on BioNLP Open Shared Tasks</source>
          <year>2019</year>
          <conf-name>BioNLP '19</conf-name>
          <conf-date>November 4, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>132</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/D19-5720</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Ohta</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tateisi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tsujii</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>GENIA corpus--semantically annotated corpus for bio-textmining</article-title>
          <source>Bioinformatics</source>
          <year>2003</year>
          <volume>19 Suppl 1</volume>
          <fpage>i180</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btg1023</pub-id>
          <pub-id pub-id-type="medline">12855455</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Charniak</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Coarse-to-fine n-best parsing and MaxEnt discriminative reranking</article-title>
          <source>Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics</source>
          <year>2005</year>
          <conf-name>ACL '05</conf-name>
          <conf-date>June 25-30, 2005</conf-date>
          <conf-loc>Ann Arbor, MI, USA</conf-loc>
          <fpage>173</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P05-1022.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1219840.1219862</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McClosky</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <source>Any Domain Parsing: Automatic Domain Adaptation for Natural Language Parsing</source>
          <year>2010</year>
          <publisher-loc>Providence, RI, USA</publisher-loc>
          <publisher-name>Brown University</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Marneffe</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>MacCartney</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Generating typed dependency parses from phrase structure parses</article-title>
          <source>Proceedings of the Fifth International Conference on Language Resources and Evaluation</source>
          <year>2006</year>
          <conf-name>LREC '06</conf-name>
          <conf-date>May 22-28, 2006</conf-date>
          <conf-loc>Genoa, Italy</conf-loc>
          <fpage>449</fpage>
          <lpage>54</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.lrec-conf.org/proceedings/lrec2006/pdf/440_pdf.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sheng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Exploiting entity BIO tag embeddings and multi-task learning for relation extraction with imbalanced data</article-title>
          <source>Proceedings of the 57th Conference of the Association for Computational Linguistics</source>
          <year>2019</year>
          <conf-name>ACL '19</conf-name>
          <conf-date>July 28-August 2, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>1351</fpage>
          <lpage>60</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P19-1130/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>
          <year>2013</year>
          <conf-name>NIPS '13</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, NV, USA</conf-loc>
          <fpage>3111</fpage>
          <lpage>9</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>An attention-based effective neural model for drug-drug interactions extraction</article-title>
          <source>BMC Bioinformatics</source>
          <year>2017</year>
          <month>10</month>
          <day>10</day>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>445</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1855-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-017-1855-x</pub-id>
          <pub-id pub-id-type="medline">29017459</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-017-1855-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC5634850</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Komandur Elayavilli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chaudhary</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Extracting chemical-protein relations using attention-based neural networks</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <fpage>bay102</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay102"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay102</pub-id>
          <pub-id pub-id-type="medline">30295724</pub-id>
          <pub-id pub-id-type="pii">5122756</pub-id>
          <pub-id pub-id-type="pmcid">PMC6174551</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bunescu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mooney</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A shortest path dependency kernel for relation extraction</article-title>
          <source>Proceedings of Human Language Technology Conference and Conference on Empirical Methods in Natural Language Processing</source>
          <year>2005</year>
          <conf-name>EMNLP '05</conf-name>
          <conf-date>October 6-8, 2005</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>724</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/H05-1091.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chowdhury</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Lavelli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moschitti</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A study on dependency tree kernels for automatic extraction of protein-protein interaction</article-title>
          <source>Proceedings of BioNLP 2011 Workshop</source>
          <year>2011</year>
          <conf-name>BioNLP '11</conf-name>
          <conf-date>June 23-24, 2011</conf-date>
          <conf-loc>Portland, OR, USA</conf-loc>
          <fpage>124</fpage>
          <lpage>33</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W11-0216.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Classifying relations via long short term memory networks along shortest dependency paths</article-title>
          <source>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2015</year>
          <conf-name>EMNLP '15</conf-name>
          <conf-date>September 17-21, 2015</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <fpage>1785</fpage>
          <lpage>94</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D15-1206.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Convolutional neural networks for sentence classification</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <month>10</month>
          <conf-name>EMNLP '14</conf-name>
          <conf-date>October 25-29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1746</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D14-1181/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Srivastava</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Krizhevsky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Dropout: a simple way to prevent neural networks from overfitting</article-title>
          <source>J Mach Learn Res</source>
          <year>2014</year>
          <volume>15</volume>
          <issue>56</issue>
          <fpage>1929</fpage>
          <lpage>58</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Adam: a method for stochastic optimization</article-title>
          <source>Proceedings of the 3rd International Conference on Learning Representations</source>
          <year>2014</year>
          <month>12</month>
          <day>22</day>
          <conf-name>ICLR '15</conf-name>
          <conf-date>May 7-9, 2015</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1412.6980"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paszke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Massa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lerer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bradbury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chanan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Killeen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gimelshein</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Antiga</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Desmaison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Köpf</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>DeVito</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Raison</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tejani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chilamkurthy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Steiner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chintala</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>PyTorch: an imperative style, high-performance deep learning library</article-title>
          <source>Proceedings of the 33rd International Conference on Neural Information Processing Systems</source>
          <year>2019</year>
          <conf-name>NeurIPS '19</conf-name>
          <conf-date>December 8-14, 2019</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>8024</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="web">
          <article-title>BioNLP-OST 2019 Evaluation Service</article-title>
          <source>Institut National de la Recherche Agronomique</source>
          <year>2019</year>
          <access-date>2022-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://bibliome.jouy.inra.fr/demo/BioNLP-OST-2019-Evaluation/index.html">http://bibliome.jouy.inra.fr/demo/BioNLP-OST-2019-Evaluation/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
