<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i8e37817</article-id>
      <article-id pub-id-type="pmid">35917162</article-id>
      <article-id pub-id-type="doi">10.2196/37817</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>A Syntactic Information–Based Classification Model for Medical Literature: Algorithm Development and Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Hao</surname>
            <given-names>Tianyong</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gao</surname>
            <given-names>Jianliang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Du</surname>
            <given-names>Yongping</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>Wentai</given-names>
          </name>
          <degrees>BCS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7594-5031</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Jian</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>College of Computer Science and Technology</institution>
            <institution>Dalian University of Technology</institution>
            <addr-line>No 2 Linggong Road</addr-line>
            <addr-line>Ganjingzi District</addr-line>
            <addr-line>Dalian, 116023</addr-line>
            <country>China</country>
            <phone>86 13604119266</phone>
            <email>wangjian@dlut.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4656-7446</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Hongfei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0872-7688</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>Di</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0876-5126</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Xu</surname>
            <given-names>Bo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5453-978X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Yijia</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5843-4675</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Zhihao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6186-2024</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>College of Computer Science and Technology</institution>
        <institution>Dalian University of Technology</institution>
        <addr-line>Dalian</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Jian Wang <email>wangjian@dlut.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>8</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>8</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>8</issue>
      <elocation-id>e37817</elocation-id>
      <history>
        <date date-type="received">
          <day>10</day>
          <month>3</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>28</day>
          <month>5</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>1</day>
          <month>6</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>6</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Wentai Tang, Jian Wang, Hongfei Lin, Di Zhao, Bo Xu, Yijia Zhang, Zhihao Yang. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 02.08.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/8/e37817" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The ever-increasing volume of medical literature necessitates the classification of medical literature. Medical relation extraction is a typical method of classifying a large volume of medical literature. With the development of arithmetic power, medical relation extraction models have evolved from rule-based models to neural network models. The single neural network model discards the shallow syntactic information along with the traditional rules. Therefore, we propose a syntactic information–based classification model that complements and equalizes syntactic information to enhance the model.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aim to complete a syntactic information–based relation extraction model for more efficient medical literature classification.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We devised 2 methods for enhancing syntactic information in the model. First, we introduced shallow syntactic information into the convolutional neural network to enhance nonlocal syntactic interactions. Second, we devised a cross-domain pruning method to equalize local and nonlocal syntactic interactions.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We experimented with 3 data sets related to the classification of medical literature. The F1 values were 65.5% and 91.5% on the BioCreative VI ChemProt (CPR) and Phenotype-Gene Relationship data sets, respectively, and the accuracy was 88.7% on the PubMed data set. Our model outperforms the current state-of-the-art baseline model in the experiments.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our model based on syntactic information effectively enhances medical relation extraction. Furthermore, the results of the experiments show that shallow syntactic information helps obtain nonlocal interaction in sentences and effectively reinforces syntactic features. It also provides new ideas for future research directions.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>medical relation extraction</kwd>
        <kwd>syntactic features</kwd>
        <kwd>pruning method</kwd>
        <kwd>neural networks</kwd>
        <kwd>medical literature</kwd>
        <kwd>medical text</kwd>
        <kwd>extraction</kwd>
        <kwd>syntactic</kwd>
        <kwd>classification</kwd>
        <kwd>interaction</kwd>
        <kwd>text</kwd>
        <kwd>literature</kwd>
        <kwd>semantic</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The classification of medical literature is especially necessary in light of the ever-increasing volume of material. Medical relation extraction is a typical method for classifying medical literature, which classifies the literature quickly by using medical texts. The advancement of this technology will have a profound impact on medical research. For example, in the sentence, “The catalytic structural domain of human phenylalanine hydroxylase binds to a catechol inhibitor,” from the medical literature (<xref rid="figure1" ref-type="fig">Figure 1</xref>), there is a “down-regulated” relation (CPR:4). We can input the text into the model to obtain the relation category as “CPR:4” in the CPR data set. Thus, we can quickly classify medical literature.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Interaction features by introducing shallow syntactic information and equalization. (A) Dependency tree without processing; (B) dependency tree after syntactic structure fusion; and (C) dependency tree after the pruning process. The weight of each arc in the forest is indicated by its number. Some edges were omitted for the sake of clarity.</p>
        </caption>
        <graphic xlink:href="medinform_v10i8e37817_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>There are 2 primary approaches for extracting medical relations: network-based and rule-based approaches. Rule-based models only obtain shallow syntactic information by imposing rule constraints, leading to early studies that focus on obtaining shallow syntactic information, such as part-of-speech tags [<xref ref-type="bibr" rid="ref1">1</xref>] or a complete structure [<xref ref-type="bibr" rid="ref2">2</xref>]. In contrast, the neural network–based model focuses on syntactic dependency features but leaves out shallow syntactic information. Now, large-scale neural network models have significantly outperformed rule-based models with the resurgence of neural network approaches [<xref ref-type="bibr" rid="ref3">3</xref>]. As a result, researchers no longer value shallow syntactic information, and medical relation extraction is gradually adopting a neural network approach. Early efforts leverage graph long short-term memory (LSTM) [<xref ref-type="bibr" rid="ref4">4</xref>] or graph neural networks [<xref ref-type="bibr" rid="ref5">5</xref>] to encode the 1-best dependency tree in the medical relation extraction. Zhang et al [<xref ref-type="bibr" rid="ref6">6</xref>] analyzed sentence interaction information using a graph convolutional network (GCN) model [<xref ref-type="bibr" rid="ref7">7</xref>]. Song et al [<xref ref-type="bibr" rid="ref8">8</xref>] constructed a dependency forest, and Jin et al [<xref ref-type="bibr" rid="ref9">9</xref>] concurrently trained a relation extraction model and a pretrained dependency parser [<xref ref-type="bibr" rid="ref10">10</xref>] to mitigate error propagation when incorporating the dependency structure.</p>
      <p>In medical relation extraction, both rule-based and neural network–based models have drawbacks. First, the rule-based approach is too costly to design rules for medical texts. Because the customization of medical text rules is different from the general-purpose domain [<xref ref-type="bibr" rid="ref11">11</xref>], it relies more on expert knowledge. Second, the neural network–based approach has difficulty in capturing sufficient syntactic features [<xref ref-type="bibr" rid="ref12">12</xref>], as shallow syntactic information is discarded. As a result, we designed a soft-rule neural network model that allows the encoding phase of the neural network model to carry shallow syntactic features, overcoming the problem of insufficient syntactic features after the neural network discards the rules.</p>
      <p>Our model can better capture the interaction features in sentences by introducing shallow syntactic information and equalization. As we can see, <xref rid="figure1" ref-type="fig">Figure 1</xref> shows the unprocessed sentence (<xref rid="figure1" ref-type="fig">Figure 1</xref>A). With the addition of shallow syntactic information to the model, it becomes the sentence shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>B with the addition of hydroxylase and inhibitor interactions. When the model is equalized, <xref rid="figure1" ref-type="fig">Figure 1</xref>B transforms into <xref rid="figure1" ref-type="fig">Figure 1</xref>C, with a more evenly distributed score of weight interactions within sentences.</p>
      <p>Overall, we propose a syntactic feature–based relation extraction model for medical literature classification, where shallow syntactic information is incorporated and equalized in a neural network. First, our model's encoder is the ordered neuron LSTM (ON-LSTM) [<xref ref-type="bibr" rid="ref13">13</xref>]. When encoded, it captures the syntactic structure in the shallow syntactic information [<xref ref-type="bibr" rid="ref13">13</xref>]. Second, we design a pruning process on the attention matrix to balance the weight of sentence interactions.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Settings</title>
        <sec>
          <title>Overview</title>
          <p>We chose 3 data sets from the medical field to evaluate our model. Using the data sets, we experimented with 2 types of medical relation extraction tasks at the cross-sentence and sentence levels.</p>
        </sec>
        <sec>
          <title>Extraction of Cross-sentence Relations</title>
          <p>For extracting cross-sentence relations, 6086 binary relation instances were extracted from PubMed [<xref ref-type="bibr" rid="ref4">4</xref>] and 6986 ternary relation instances were noted in the data sets. This yielded 2 data sets for more detailed evaluation [<xref ref-type="bibr" rid="ref14">14</xref>]: one contains 5 categories of relational labels and the other groups all labels that are not “None” into one category.</p>
          <p>For extracting sentence-level relations, we referred to the BioCreative VI ChemProt (CPR) and Phenotype-Gene Relationship (PGR) data sets. The PGR data set introduces the information between human genes with human phenotypes; it contains 218 test instances and 11,781 training instances and 2 types of relation labels: “No” and “Yes.” The CPR data set contains information about the interactions between human proteins and chemical components. It has 16,106 training, 14,268 testing, and 10,031 development instances, as well as 5 relation types, such as “None,” “CPR:2,” and “CPR:6.” We combined these 2 data sets into 1 table to make it more intuitive.</p>
        </sec>
        <sec>
          <title>Experimental Parameter Setting</title>
          <p>For the cross-sentence relation task, we used the same data splits that Guo et al [<xref ref-type="bibr" rid="ref14">14</xref>] used. The hidden size of ON-LSTM is set to 300 in our stochastic gradient descent optimizer with a 300-dimensional GloVe and 0.9 decay rate, and we report the average test accuracy over 5 cross-validation folds. For the sentence-level task, the F1 results are reported as in the study by Song et al [<xref ref-type="bibr" rid="ref8">8</xref>], and we randomly divided 10% of the PGR training set as the development set to ensure consistent data division. We fine-tuned the hyperparameters based on the outcomes of the development sets. The results marked with an asterisk are based on a reimplementation of the original model. The aforementioned configuration ensures that our model has a consistent data partitioning and operating environment with the baseline.</p>
        </sec>
      </sec>
      <sec>
        <title>The Overall Architecture</title>
        <p>An overview of our proposed syntactic enhancement graph convolutional network (SEGCN) model (<xref rid="figure2" ref-type="fig">Figure 2</xref>) consists of 3 parts: an Encoder, a Feature Processor, and a classifier. The Encoder incorporates the syntactic structural features, and the Feature Processor handles the features containing structural information.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Diagrammatic representation of the syntactic enhancement graph convolutional network model showing an instance and its syntactic information processing flow. The syntactic structure tree can be obtained from the encoder, and a matrix-tree can transform the syntactic dependency tree in the feature processor.</p>
          </caption>
          <graphic xlink:href="medinform_v10i8e37817_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Encoder</title>
        <p>We used ON-LSTM [<xref ref-type="bibr" rid="ref13">13</xref>] to obtain a syntactic structure in shallow syntactic information. The ON-LSTM introduces syntactic structure information while encoding by layering the neurons. In terms of the overall framework, it is similar to LSTM. Here, we mathematically illustrate how ON-LSTM incorporates syntactic structural features.</p>
        <p>Given a sentence <italic>s</italic> = <italic>x</italic><sub>1</sub>,…,<italic>x</italic><sub>n</sub>, where <italic>x</italic><sub>i</sub> represents the <italic>i</italic>-th word. We have written <bold>h</bold> = <bold>h</bold><sub>1</sub>,…,<bold>h</bold><sub>n</sub> for the structural output of the sentence <bold>h</bold> <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> R<sup>n×</sup><italic><sup>d</sup></italic>, where <bold>h</bold><italic><sub>i</sub></italic> <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> R<italic><sup>d</sup></italic> denotes the <italic>i</italic>-th word’s hidden state with a <italic>d</italic> dimension. A cell <italic>c</italic><sub>t</sub> is used to record the state of <bold>h</bold><italic><sub>t</sub></italic>; to control <bold>h</bold><italic><sub>t</sub></italic>, that is, the data flow between the inputs and outputs, a forget gate <italic>f<sub>t</sub></italic>, an output gate <italic>o</italic><sub>t</sub>, and an input gate <italic>i<sub>t</sub></italic> are employed. Here, <bold>W</bold><italic><sub>x</sub></italic>, <bold>U</bold><italic><sub>x</sub></italic>, and <italic>b<sub>x</sub></italic>(x <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> f, i, o, c) are model parameters, and <italic>c</italic><sub>0</sub> is a zero-filled vector:</p>
        <disp-formula><italic>f<sub>t</sub></italic> = <italic>σ</italic>(<italic>W<sub>f</sub>x<sub>t</sub></italic> + <italic>U<sub>f</sub>h<sub>t</sub></italic><sub>–1</sub> + <italic>b<sub>f</sub></italic>) <bold>(1)</bold></disp-formula>
        <disp-formula><italic>i<sub>t</sub></italic> = <italic>σ</italic>(<italic>W<sub>i</sub>x<sub>t</sub></italic> + <italic>U<sub>i</sub>h<sub>t</sub></italic><sub>–1</sub> + <italic>b<sub>i</sub></italic>) <bold>(2)</bold></disp-formula>
        <disp-formula><italic>o<sub>t</sub></italic> = <italic>σ</italic>(<italic>W<sub>o</sub>x<sub>t</sub></italic> + <italic>U<sub>o</sub>h<sub>t</sub></italic><sub>–1</sub> + <italic>b<sub>o</sub></italic>) <bold>(3)</bold></disp-formula>
        <disp-formula><italic>c<sub>t</sub></italic> = tanh(<italic>W<sub>c</sub>x<sub>t</sub></italic> + <italic>U<sub>c</sub>h<sub>t</sub></italic><sub>–1</sub> + <italic>b<sub>c</sub></italic>) <bold>(4)</bold></disp-formula>
        <disp-formula><italic>h<sub>t</sub></italic> = <italic>o<sub>t</sub></italic> • tanh(<italic>c<sub>t</sub></italic>) <bold>(5)</bold></disp-formula>
        <p>It differs from the LSTM in that it uses a new function to replace the update function of the cell state <italic>c<sub>t</sub></italic>. By replacing the update function, a specific ordering is imposed on the internal neurons, allowing the syntactic structure to be integrated into the LSTM. The update rules are as follows.</p>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig5.png" xlink:type="simple" mimetype="image"/>
          <bold>(6)</bold>
        </disp-formula>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig6.png" xlink:type="simple" mimetype="image"/>
          <bold>(7)</bold>
        </disp-formula>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig7.png" xlink:type="simple" mimetype="image"/>
          <bold>(8)</bold>
        </disp-formula>
        <p>We used softmax to predict the layer order of neurons and then calculated the cumulative sum by cs. Finally, <italic>f᷉<sub>t</sub></italic> and <italic>i᷉<sub>t</sub></italic> contain the layer order information of <italic>c<sub>t</sub></italic><sub>–1</sub> and <italic>c<sub>t</sub></italic>, respectively, and the intersection of the two is <italic>ω<sub>t</sub></italic>. The cumulative sum equation is as follows.</p>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig8.png" xlink:type="simple" mimetype="image"/>
        </disp-formula>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig14.png" xlink:type="simple" mimetype="image"/>
          <bold>(9)</bold>
        </disp-formula>
        <disp-formula>
          <inline-graphic xlink:href="medinform_v10i8e37817_fig9.png" xlink:type="simple" mimetype="image"/>
          <bold>(10)</bold>
        </disp-formula>
        <p>Following the cumulative sum’s properties, the master forget gate <italic>f᷉<sub>t</sub></italic> has values that increase monotonically from 0 to 1, while the master input gate <italic>i᷉<sub>t</sub></italic> has values that decrease monotonically from 1 to 0. The overlap of <italic>f᷉<sub>t</sub></italic> and <italic>i᷉<sub>t</sub></italic> is represented by the product of the two master gates <italic>ω<sub>t</sub></italic>.</p>
        <disp-formula><italic>C</italic> = <italic>ω<sub>t</sub></italic> • (<italic>f<sub>t</sub></italic> • <italic>c<sub>t</sub></italic><sub>–1</sub> + <italic>i<sub>t</sub></italic> • <italic>c<sub>t</sub></italic>) + (<italic>f᷉<sub>t</sub></italic> – <italic>ω<sub>t</sub></italic>) • <italic>c<sub>t</sub></italic><sub>–1</sub> + (<italic>i᷉<sub>t</sub></italic> – <italic>ω<sub>t</sub></italic>) • <italic>c</italic><sub>t</sub> <bold>(11)</bold></disp-formula>
        <p>Finally, the cell state <italic>C</italic> is segmented by layer order information, and the syntactic structure is fused into the model.</p>
      </sec>
      <sec>
        <title>Feature Processor</title>
        <sec>
          <title>Multi-Head Attention</title>
          <p>By building an attention adjacency matrix <bold><italic>S</italic></bold><italic><sup>k</sup></italic>, we converted the feature <bold>h</bold> to a fully connected weight graph. A set of key-value pairs and a query were used in the calculation. The obtained attention matrices represent the potential syntactic tree, which is computed from the function of the keyword <bold>K</bold> with the corresponding query <bold>Q</bold>. In this case, both <bold>Q</bold> and <bold>K</bold> are the same as <bold>h</bold>.</p>
          <disp-formula>
            <inline-graphic xlink:href="medinform_v10i8e37817_fig10.png" xlink:type="simple" mimetype="image"/>
            <bold>(12)</bold>
          </disp-formula>
          <p>Here, <bold>W</bold><italic><sup>Q</sup></italic>  <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> R<italic><sup>d</sup></italic><sup>×</sup><italic><sup>d</sup></italic> and <bold>W</bold><italic><sup>K</sup></italic>  <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> R<italic><sup>d</sup></italic><sup>×</sup><italic><sup>d</sup></italic> are parameters for projections, and <italic>d</italic> denotes the vector dimension. <bold>S</bold><italic><sup>k</sup></italic> consists of <inline-graphic xlink:href="medinform_v10i8e37817_fig11.png" xlink:type="simple" mimetype="image"/>. <bold>h</bold><italic><sub>i</sub></italic> and <bold>h</bold><italic><sub>j</sub></italic> represent the normalized weight scores of the <italic>i</italic>-th and the <italic>j</italic>-th token, respectively.</p>
        </sec>
        <sec>
          <title>Matrix-Tree Pruning</title>
          <p>We pruned the matrix-tree <bold>S</bold><italic><sup>k</sup></italic> to balance the syntactic features, and output the result as matrix-tree A. This is achieved by multiplying a Gaussian kernel with the attention matrix. In the field of image processing, Gaussian kernel functions are commonly used to equalize images. In the model, we chose a 2-dimensional Gaussian kernel to balance the syntactic features. The following is the Gaussian kernel function.</p>
          <disp-formula>
            <inline-graphic xlink:href="medinform_v10i8e37817_fig12.png" xlink:type="simple" mimetype="image"/>
            <bold>(13)</bold>
          </disp-formula>
          <p>where <italic>a</italic> is the amplitude, <italic>x<sub>o</sub></italic> and <italic>y<sub>o</sub></italic> are the coordinates of the center point, and <italic>σ<sub>x</sub></italic> and <italic>σ<sub>y</sub></italic> are the variance. With the aforementioned 2-dimensional Gaussian kernel function, we could obtain the Gaussian kernel.</p>
        </sec>
        <sec>
          <title>GCN</title>
          <p>GCN is a neural network that can use information about the graph's structure. On the input of the GCN, we replaced the graph structure of the input with the syntactic tree matrix A generated above, and the feature vector is the output vector <bold>h</bold> of the Encoder. The layer-wise propagation rules of GCN are as follows:</p>
          <disp-formula>
            <inline-graphic xlink:href="medinform_v10i8e37817_fig13.png" xlink:type="simple" mimetype="image"/>
            <bold>(14)</bold>
          </disp-formula>
          <p>The adjacency matrix of an undirected graph <bold>g</bold> with extra self-connections is denoted by <bold>Ã</bold>, <bold>Ã</bold> = <bold>A</bold> + <italic>I<sub>N</sub>. I<sub>N</sub></italic> is the identity matrix, <italic>D᷉<sub>ii</sub></italic> = Σ<italic><sub>j</sub></italic> <bold>Ã</bold><sub>ij</sub>. <bold>W</bold><sup>(</sup><italic><sup>l</sup></italic><sup>)</sup> is a trainable weight matrix. The activation function is denoted by <italic>σ</italic>(•). <bold>H</bold><sup>(</sup><italic><sup>l</sup></italic><sup>)</sup>  <inline-graphic xlink:href="medinform_v10i8e37817_fig15.png" xlink:type="simple" mimetype="image"/> R<italic><sup>N</sup></italic><sup>×</sup><italic><sup>D</sup></italic> is the activation matrix in the <italic>l</italic>-th layer, <bold>H</bold><sup>(0)</sup> denotes <bold>h</bold>.</p>
        </sec>
      </sec>
      <sec>
        <title>Classifier</title>
        <p>To obtain final categorization representations, we combined sentence and entity representations and fed them into a feedforward neural network.</p>
        <disp-formula><italic>H</italic><sub>final</sub> = <italic>FFNN</italic>([<italic>H<sub>sent</sub></italic> ; <italic>H<sub>s</sub></italic> ; <italic>H<sub>o</sub></italic>]) <bold>(15)</bold></disp-formula>
        <p><bold>H</bold><sub>sent</sub>, <bold>H</bold><italic><sub>s</sub></italic>, and <bold>H</bold><italic><sub>o</sub></italic> denote sentence, subject, and object representations, respectively. Finally, the logistic regression classifier performs predicted categorization of the outcome using <bold>H</bold><sub>final</sub> as a token.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Results of the Cross-sentence Task</title>
        <p>For the cross-sentence task, we used 3 types of models as baselines: (1) feature-based classifier [<xref ref-type="bibr" rid="ref15">15</xref>] based on all entity pairs' shortest dependency pathways; (2) graph-structured LSTM methods, including bidirectional directed acyclic graph (DAG) LSTM (Bidir DAG LSTM) [<xref ref-type="bibr" rid="ref5">5</xref>], Graph State LSTM (GS LSTM), and Graph LSTM [<xref ref-type="bibr" rid="ref4">4</xref>]—these approaches extend LSTM to encode graphs generated from dependency edges created from input phrases; and (3) pruned GCNs [<xref ref-type="bibr" rid="ref6">6</xref>] including attention-guided GCN (AGGCN) [<xref ref-type="bibr" rid="ref14">14</xref>] and Lévy Flights GCN (LFGCN) [<xref ref-type="bibr" rid="ref11">11</xref>]. These methods use GCNs to prune graphs with dependency edges. Additionally, we added the Bidirectional Encoder Representations from Transformers (BERT) pretraining model to complement the model with experiments. The results marked with an asterisk are based on a reimplementation of the original model.</p>
        <p>In the multi-class relation extraction task (last 2 columns in <xref ref-type="table" rid="table1">Table 1</xref>), our SEGCN model outperforms all baselines with accuracies of 81.7 and 80.2 on all instances (Cross). In the ternary and binary relations, our SEGCN model outperforms the best performing graph-structured LSTM model (GS LSTM) by 10.0 and 8.5 points, respectively. Compared with the GCN models, our model outperforms the best performing model, LFGCN, by 1.8 and 2.6 points.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>
          Results of the cross-sentence task.
        </p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="320"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td colspan="4">Binary-class, accuracy</td>
                <td colspan="2">Multi-class, accuracy</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Ternary</td>
                <td colspan="2">Binary</td>
                <td>Ternary</td>
                <td>Binary</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>Cross</td>
                <td>Single</td>
                <td>Cross</td>
                <td>Cross</td>
                <td>Cross</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Feature-Based</td>
                <td>74.7</td>
                <td>77.7</td>
                <td>73.9</td>
                <td>75.2</td>
                <td>—<sup>a</sup></td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Graph LSTM<sup>b</sup></td>
                <td>77.9</td>
                <td>80.7</td>
                <td>75.6</td>
                <td>76.7</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>DAG<sup>c</sup> LSTM</td>
                <td>77.9</td>
                <td>80.7</td>
                <td>74.3</td>
                <td>76.5</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>GS LSTM<sup>d</sup></td>
                <td>80.3</td>
                <td>83.2</td>
                <td>83.5</td>
                <td>83.6</td>
                <td>71.7</td>
                <td>71.7</td>
              </tr>
              <tr valign="top">
                <td>GCN<sup>e</sup> + Pruned</td>
                <td>85.8</td>
                <td>85.8</td>
                <td>83.8</td>
                <td>83.7</td>
                <td>78.1</td>
                <td>73.6</td>
              </tr>
              <tr valign="top">
                <td>AGGCN<sup>f</sup></td>
                <td>87.1</td>
                <td>87.0</td>
                <td>85.2</td>
                <td>85.6</td>
                <td>80.2</td>
                <td>77.4</td>
              </tr>
              <tr valign="top">
                <td>LFGCN<sup>g</sup></td>
                <td>87.3</td>
                <td>86.5</td>
                <td>86.7</td>
                <td>85.7</td>
                <td>79.9</td>
                <td>77.6</td>
              </tr>
              <tr valign="top">
                <td>AGGCN + BERT<sup>h</sup></td>
                <td>87.2</td>
                <td>87.1</td>
                <td>86.1</td>
                <td>84.9</td>
                <td>80.5</td>
                <td>78.1</td>
              </tr>
              <tr valign="top">
                <td>LFGCN + BERT</td>
                <td>87.3</td>
                <td>86.5</td>
                <td>86.5</td>
                <td>86.7</td>
                <td>80.3</td>
                <td>78.0</td>
              </tr>
              <tr valign="top">
                <td>SEGCN<sup>i</sup></td>
                <td>88.5</td>
                <td>88.2</td>
                <td>87.2</td>
                <td>87.5</td>
                <td>81.7</td>
                <td>80.2</td>
              </tr>
              <tr valign="top">
                <td>SEGCN + BERT</td>
                <td>88.7</td>
                <td>88.4</td>
                <td>86.8</td>
                <td>87.7</td>
                <td>81.9</td>
                <td>80.4</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Not determined.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>DAG: directed acyclic graph.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>GS LSTM: graph-structured long short-term memory.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>GCN: graph convolutional network.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>AGGCN: attention-guided graph convolutional network.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>LFGCN: Lévy Flights graph convolutional network.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>SEGCN: syntactic enhancement graph convolutional network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In the binary-class relation extraction task, our SEGCN model also outperforms all baselines (first 4 columns in <xref ref-type="table" rid="table1">Table 1</xref>). The task was expanded to cross-sentence– (Cross) and sentence-level (Single) subtasks. In cross-sentence–level ternary and binary classification, our model received 88.2 and 87.5 points, respectively. Our model received 88.5 and 87.2 for sentence-level ternary and binary classifications, respectively.</p>
        <p>These experiments show that our model achieves better results than previous models that discard shallow syntactic information, such as the previous GS LSTM and GCN models. We attribute the results of our models to the introduction of shallow syntactic information and the equalization process. Finally, for comparison with the latest methods, we attempted to introduce BERT pretraining. We found that the results of the task improved slightly after BERT pretraining. We believe that BERT also captured some shallow syntactic information during pretraining.</p>
      </sec>
      <sec>
        <title>Results of the Sentence-Level Task</title>
        <p>The results of the sentence-level task using the CPR [<xref ref-type="bibr" rid="ref11">11</xref>] and PGR [<xref ref-type="bibr" rid="ref16">16</xref>] data sets are shown in <xref ref-type="table" rid="table2">Table 2</xref>. Our model has been compared to 2 types of models: (1) sequence-based models, including the randomly initialized Dilated and Depthwise separable convolutional neural network (Random-DDCNN) [<xref ref-type="bibr" rid="ref9">9</xref>], which uses a parser that is a relational prediction model through random initialization and fine-tuning; attention-based multilayer gated recurrent unit [<xref ref-type="bibr" rid="ref17">17</xref>], which overlays attentional mechanisms on top of the recursive gated units; Bran [<xref ref-type="bibr" rid="ref18">18</xref>], which uses a bi-affine self-attention model to capture the sentence's interactions; and Bidirectional Encoder Representations from Transformers for Biomedical Text Mining [<xref ref-type="bibr" rid="ref19">19</xref>], which is a pretrained language representation model for medical literature; and (2) dependency-based models, which are based on a single dependency tree, including the biological ontology–based long short-term memory network [<xref ref-type="bibr" rid="ref20">20</xref>] and GCN. There are also dependency forest–based models, including the Edgewise–graph recurrent network (GRN) [<xref ref-type="bibr" rid="ref8">8</xref>], which prunes scores greater than a threshold; kBest-GRN [<xref ref-type="bibr" rid="ref8">8</xref>], which involves merging of k-best trees for prediction; ForestFT-DDCNN [<xref ref-type="bibr" rid="ref9">9</xref>], which constructs a learnable dependency analyzer; and AGGCN and LFGCN [<xref ref-type="bibr" rid="ref11">11</xref>], which relate multiheaded attention to dependency features.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Results of the sentence-level task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="280"/>
            <col width="320"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Type and model</td>
                <td>Multi-class (BioCreative VI CPR data set), F1 score</td>
                <td>Binary-class (Phenotype-Gene Relationship data set), F1 score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Sequence-based model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Random-DDCNN<sup>a</sup></td>
                <td>45.4</td>
                <td>—<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Att-GRU<sup>c</sup></td>
                <td>49.5</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bran</td>
                <td>50.8</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioBERT<sup>d</sup></td>
                <td>—</td>
                <td>67.2</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Dependency-based model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BO-LSTM<sup>e</sup></td>
                <td>—</td>
                <td>52.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GCN<sup>f</sup></td>
                <td>52.2</td>
                <td>81.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Edgewise-GRN<sup>g</sup></td>
                <td>53.4</td>
                <td>83.6</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>kBest-GRN</td>
                <td>52.4</td>
                <td>85.7</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ForestFT-DDCNN</td>
                <td>55.7</td>
                <td>89.3</td>
              </tr>
              <tr valign="top">
                <td colspan="2">AGGCN<sup>h</sup></td>
                <td>56.7</td>
                <td>88.5</td>
              </tr>
              <tr valign="top">
                <td colspan="2">LFGCN<sup>i</sup></td>
                <td>64.0</td>
                <td>89.6</td>
              </tr>
              <tr valign="top">
                <td colspan="2">LFGCN+BERT</td>
                <td>64.2</td>
                <td>89.8</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Our models</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SEGCN<sup>j</sup></td>
                <td>65.4</td>
                <td>91.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SEGCN+BERT</td>
                <td>65.6</td>
                <td>91.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>DDCNN: Dilated and Depthwise separable convolutional neural network.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Not determined.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>Att-GRU: attention-based multilayer gated recurrent unit.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>BioBERT: Bidirectional Encoder Representations from Transformers for Biomedical Text Mining.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>BO-LSTM: biological ontology–based long short-term memory.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>GCN: graph convolutional network.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>GRN: graph recurrent network.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>AGGCN: attention-guided graph convolutional network.</p>
            </fn>
            <fn id="table2fn9">
              <p><sup>i</sup>LFGCN: Lévy Flights graph convolutional network.</p>
            </fn>
            <fn id="table2fn10">
              <p><sup>j</sup>SEGCN: syntactic enhancement graph convolutional network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>As shown in the results of the sentence-level task in <xref ref-type="table" rid="table2">Table 2</xref>, our model achieved the best performance on both the multiclass data set CPR and the dichotomous data set PGR, with F1 scores of 65.4 and 91.3. Specifically, our model outperformed the previous state-of-the-art dependency-based model (LFGCN) by 1.2 and 1.5 points on the CPR and PGR data sets, respectively. We found that the model's improvement was smaller than that on the cross-sentence level task. We argue that shallow syntactic information has a smaller impact on short sentence lengths in sentence-level tasks, and it is better suited to long sentence lengths in cross-sentence tasks.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Ablation Study</title>
        <p>We validated the different modules of our model on the PGR data set, including BERT pretraining, the matrix-tree pruning layer, and the feature capture layer. <xref ref-type="table" rid="table3">Table 3</xref> shows these results. We can see that model effectiveness decreases after removing any of the modules. All three modules can aid in the model's learning of a more accurate feature representation. The feature capture layer and the matrix-tree pruning layer improved by 2.4 and 2.5 points, respectively, indicating that the shallow syntactic information and equalization process resulted in a model boost. In contrast, the popular BERT pretraining approach was not suitable for the model.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>An ablation study using the Phenotype-Gene Relationship data set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="760"/>
            <col width="240"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>F1 score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>SEGCN<sup>a</sup> (All)</td>
                <td>91.5</td>
              </tr>
              <tr valign="top">
                <td>SEGCN (- BERT Pretraining)</td>
                <td>91.3</td>
              </tr>
              <tr valign="top">
                <td>SEGCN (- Matrix-tree pruning)</td>
                <td>90.0</td>
              </tr>
              <tr valign="top">
                <td>SEGCN (- Feature capture)</td>
                <td>89.1</td>
              </tr>
              <tr valign="top">
                <td>Baseline (- All)</td>
                <td>88.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>SEGCN: syntactic enhancement graph convolutional network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The ablation experiments show that shallow syntactic information and equalization processing methods can improve model performance significantly. We believe that these two methods function by processing the interaction information in the sentences. The shallow syntactic information complements the nonlocal interaction of the sentence, and the equalization process balances the local and nonlocal interactions of the sentence.</p>
      </sec>
      <sec>
        <title>Performance Against Sentence Length</title>
        <p>We examined the effect of introducing shallow syntactic information on different sentence lengths through comparative experiments. <xref rid="figure3" ref-type="fig">Figure 3</xref>A shows the F1 scores of the 3 models at different sentence lengths. There are 3 categories based on sentence length ((0,25], [25,50),&#62;50). In general, our SEGCN outperformed ForestFT-DDCNN and LFGCN in all 3 length categories. Furthermore, the performance gap widened as the instance length increased. These results suggest that adding shallow syntactic information, particularly in long sentences, improves our model significantly. We attribute this to the fact that our model complements the nonlocal interactions of the sentences with the introduction of shallow syntactic information. Because they rely more on nonlocal interactions, longer sentences received higher F1 scores.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Performance against sentence length and Bidirectional Encoder Representations from Transformers (BERT) pretraining. (A) F1 scores at different sentence lengths. Results of the ForestFT–Dilated and Depthwise separable convolutional neural network are based on Jin et al [<xref ref-type="bibr" rid="ref10">10</xref>]. (B) F1 scores against sentence length after BERT pretraining. AGGCN: attention-guided graph convolutional network; LFGCN: Lévy Flights graph convolutional network.</p>
          </caption>
          <graphic xlink:href="medinform_v10i8e37817_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Performance Against BERT Pretraining</title>
        <p>To show the superiority of syntactic enhancement of our models, we compared the models with the addition of pretraining. After BERT pretraining, the F1 scores of the 3 models are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>B for different sentence lengths. There are 3 categories based on sentence length ((0,25], [25,50),&#62;50). Overall, BERT pretraining showed small improvements for models of different sentence lengths. It supports our hypothesis that the neural network models acquire insufficient syntactic features. Furthermore, we found that our SEGCN without BERT still functioned better than the other models with BERT. These results indicate that our model outperforms BERT in using syntactical features.</p>
      </sec>
      <sec>
        <title>Case Study</title>
        <p>To demonstrate the impact of our approach on sentence interaction, we compared the features obtained from different model layers. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows the attention weights of the example sentences at the different layers of the model. We decided to use a heat map to represent the attention weights. The color of each point represents the weight of the interactive information. The darker the color, the greater the weighting. For more intuition, we have omitted the points with smaller weights. In addition, the output of the multi-headed attention layer before and after incorporation into the shallow syntactic information is represented by matrices A and B, respectively. Matrix C represents the output of the equalization processing matrix B.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The heat maps of an example sentence in the syntactic enhancement graph convolutional network model.</p>
          </caption>
          <graphic xlink:href="medinform_v10i8e37817_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, the weight distribution in matrix <bold>A</bold> is more concentrated in the diagonal distribution. In contrast, matrix <bold>B</bold> and matrix <bold>C</bold> have significantly more nondiagonal weight distributions than matrix <bold>A</bold>. This supports our view that the model incorporating shallow syntactic information gradually focuses on nonlocal interactions in the sentence. Furthermore, by comparing matrices <bold>B</bold> and <bold>C</bold>, we see that equalized matrix <bold>C</bold> pays more even-handed attention to the model's weights (the more similar the color, the closer the weights). We believe that the model's performance is improved by balancing the attention to local and nonlocal interactions. These results further demonstrate how our model makes use of syntactic information for syntactic enhancement.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study is the first to propose incorporating shallow syntactic information for syntactic enhancement in medical relation extraction. In addition, we devised a new pruning method to equalize the syntactic interactions in the model. The results for the 3 medical data sets show that our method can improve and equalize syntactic interactions, significantly outperforming previous models. The ablation experiments demonstrate the effectiveness of our two proposed methods. In future, we intend to continue our research on the connection between shallow syntactic information and sentence interactions.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AGGCN</term>
          <def>
            <p>attention-guided graph convolutional network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DAG</term>
          <def>
            <p>directed acyclic graph</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DDCNN</term>
          <def>
            <p>Dilated and Depthwise separable convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GCN</term>
          <def>
            <p>graph convolutional network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GRN</term>
          <def>
            <p>graph recurrent network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LFGCN</term>
          <def>
            <p>Lévy Flights graph convolutional network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ON-LSTM</term>
          <def>
            <p>ordered neuron–long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">PGR</term>
          <def>
            <p>Phenotype-Gene Relationship</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">Random-DDCNN</term>
          <def>
            <p>randomly initialized Dilated and Depthwise separable convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SEGCN</term>
          <def>
            <p>syntactic enhancement graph convolutional network</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The publication of this paper is funded by grants from the Natural Science Foundation of China (62006034 and 62072070), Natural Science Foundation of Liaoning Province (2021-BS-067), and the Fundamental Research Funds for the Central Universities [DUT21RC (3)015].</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>WT led the method application, experiment conduction, and the result analysis. DZ participated in the data extraction and preprocessing. YZ participated in the manuscript revision. HM provided theoretical guidance and the revision of this paper.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heeman</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>Incorporating POS Tagging Into Language Modeling</article-title>
          <year>1997</year>
          <conf-name>Fifth European Conference on Speech Communication and Technology, EUROSPEECH</conf-name>
          <conf-date>September 22-25, 1997</conf-date>
          <conf-loc>Rhodes</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cs.rochester.edu/research/cisd/pubs/1997/paper1.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>GJF</given-names>
            </name>
            <name name-style="western">
              <surname>Lloyd-Thomas</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A robust language model incorporating a substring parser and extended n-grams</article-title>
          <year>1994</year>
          <conf-name>ICASSP '94. IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name>
          <conf-date>April 19-22, 1994</conf-date>
          <conf-loc>Adelaide, SA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.1994.389281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Merity</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Keskar</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Regularizing and optimizing LSTM language models</article-title>
          <year>2018</year>
          <conf-name>6th International Conference on Learning Representations, ICLR 2018</conf-name>
          <conf-date>April 30 - May 3, 2018</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Quirk</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yih</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Cross-Sentence N-ary Relation Extraction with Graph LSTMs</article-title>
          <source>TACL</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>101</fpage>
          <lpage>115</lpage>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Linfeng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yue</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhiguo</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>N-ary Relation Extraction using Graph-State LSTM</article-title>
          <year>2018</year>
          <conf-name>2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 31, 2018</conf-date>
          <conf-loc>Brussels</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1246</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Graph Convolution over Pruned Dependency Trees Improves Relation Extraction</article-title>
          <year>2018</year>
          <conf-name>2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 31, 2018</conf-date>
          <conf-loc>Brussels</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1244</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Semi-Supervised Classification of Graph Convolutional Networks with Laplacian Rank Constraints</article-title>
          <source>Neural Process Lett</source>
          <year>2021</year>
          <month>01</month>
          <day>01</day>
          <pub-id pub-id-type="doi">10.1007/s11063-020-10404-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gildea</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Leveraging Dependency Forest for Neural Medical Relation Extraction</article-title>
          <year>2019</year>
          <conf-name>2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name>
          <conf-date>November 2019</conf-date>
          <conf-loc>Hong Kong</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1020</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Relation Extraction Exploiting Full Dependency Forests</article-title>
          <year>2020</year>
          <month>04</month>
          <day>03</day>
          <conf-name>AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>February 7–12, 2020</conf-date>
          <conf-loc>New York, NY</conf-loc>
          <pub-id pub-id-type="doi">10.1609/aaai.v34i05.6313</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dozat</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Deep biaffine attention for neural dependency parsing</article-title>
          <year>2017</year>
          <conf-name>5th International Conference on Learning Representations, ICLR 2017</conf-name>
          <conf-date>April 24-26, 2017</conf-date>
          <conf-loc>Toulon</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>SB</given-names>
            </name>
          </person-group>
          <article-title>Learning Latent Forests for Medical Relation Extraction</article-title>
          <year>2020</year>
          <conf-name>Twenty-Ninth International Joint Conference on Artificial Intelligence</conf-name>
          <conf-date>2020</conf-date>
          <conf-loc>Yokohama</conf-loc>
          <pub-id pub-id-type="doi">10.24963/ijcai.2020/505</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hale</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kuncoro</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Finding syntax in human encephalography with beam search</article-title>
          <year>2018</year>
          <conf-name>56th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne, VIC</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p18-1254</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sordoni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Courville</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Ordered Neurons: Integrating Tree Structures into Recurrent Neural Networks</article-title>
          <source>arXiv. Preprint posted online May 8, 2019</source>
          <year>2019</year>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Attention Guided Graph Convolutional Networks for Relation Extraction</article-title>
          <year>2019</year>
          <conf-name>57th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 2019</conf-date>
          <conf-loc>Florence</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quirk</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Distant Supervision for Relation Extraction beyond the Sentence Boundary</article-title>
          <year>2017</year>
          <conf-name>15th Conference of the European Chapter of the Association for Computational Linguistics</conf-name>
          <conf-date>April 2017</conf-date>
          <conf-loc>Valencia</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/e17-1110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sousa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lamurias</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Couto</surname>
              <given-names>FM</given-names>
            </name>
          </person-group>
          <article-title>A Silver Standard Corpus of Human Phenotype-Gene Relations</article-title>
          <year>2019</year>
          <conf-name>2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n19-1152</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Komandur Elayavilli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chaudhary</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Extracting chemical-protein relations using attention-based neural networks</article-title>
          <source>Database (Oxford)</source>
          <year>2018</year>
          <month>01</month>
          <day>01</day>
          <volume>2018</volume>
          <fpage>102</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bay102"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bay102</pub-id>
          <pub-id pub-id-type="medline">30295724</pub-id>
          <pub-id pub-id-type="pii">5122756</pub-id>
          <pub-id pub-id-type="pmcid">PMC6174551</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Strubell</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Simultaneously Self-Attending to All Mentions for Full-Abstract Biological Relation Extraction</article-title>
          <year>2018</year>
          <conf-name>2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1080</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lamurias</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sousa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Couto</surname>
              <given-names>FM</given-names>
            </name>
          </person-group>
          <article-title>BO-LSTM: classifying relations via long short-term memory networks along biomedical ontologies</article-title>
          <source>BMC Bioinformatics</source>
          <year>2019</year>
          <month>01</month>
          <day>07</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>10</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2584-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-018-2584-5</pub-id>
          <pub-id pub-id-type="medline">30616557</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-018-2584-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6323831</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
