<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i5e25304</article-id>
      <article-id pub-id-type="pmid">33970113</article-id>
      <article-id pub-id-type="doi">10.2196/25304</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Combining External Medical Knowledge for Improving Obstetric Intelligent Diagnosis: Model Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Lishuang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liang</surname>
            <given-names>Huiying</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Kunli</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9402-1560</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Cai</surname>
            <given-names>Linkun</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9728-8164</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Song</surname>
            <given-names>Yu</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Information Engineering</institution>
            <institution>Zhengzhou University</institution>
            <addr-line>No 100, Science Avenue</addr-line>
            <addr-line>Zhengzhou, 450000</addr-line>
            <country>China</country>
            <phone>86 137 0084 2398</phone>
            <email>ieysong@zzu.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0771-7499</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Tao</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1873-5462</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>Yueshu</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1062-2546</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Information Engineering</institution>
        <institution>Zhengzhou University</institution>
        <addr-line>Zhengzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>The Third Affiliated Hospital of Zhengzhou University</institution>
        <addr-line>Zhengzhou</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yu Song <email>ieysong@zzu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>5</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>5</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>5</issue>
      <elocation-id>e25304</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>1</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>3</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>18</day>
          <month>4</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Kunli Zhang, Linkun Cai, Yu Song, Tao Liu, Yueshu Zhao. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 10.05.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/5/e25304" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Data-driven medical health information processing has become a new development trend in obstetrics. Electronic medical records (EMRs) are the basis of evidence-based medicine and an important information source for intelligent diagnosis. To obtain diagnostic results, doctors combine clinical experience and medical knowledge in their diagnosis process. External medical knowledge provides strong support for diagnosis. Therefore, it is worth studying how to make full use of EMRs and medical knowledge in intelligent diagnosis.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to improve the performance of intelligent diagnosis in EMRs by combining medical knowledge.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>As an EMR usually contains multiple types of diagnostic results, the intelligent diagnosis can be treated as a multilabel classification task. We propose a novel neural network knowledge-aware hierarchical diagnosis model (KHDM) in which Chinese obstetric EMRs and external medical knowledge can be synchronously and effectively used for intelligent diagnostics. In KHDM, EMRs and external knowledge documents are integrated by the attention mechanism contained in the hierarchical deep learning framework. In this way, we enrich the language model with curated knowledge documents, combining the advantages of both to make a knowledge-aware diagnosis.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We evaluated our model on a real-world Chinese obstetric EMR dataset and showed that KHDM achieves an accuracy of 0.8929, which exceeds that of the most advanced classification benchmark methods. We also verified the model’s interpretability advantage.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>In this paper, an improved model combining medical knowledge and an attention mechanism is proposed, based on the problem of diversity of diagnostic results in Chinese EMRs. KHDM can effectively integrate domain knowledge to greatly improve the accuracy of diagnosis.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>intelligent diagnosis</kwd>
        <kwd>obstetric electronic medical record</kwd>
        <kwd>medical knowledge</kwd>
        <kwd>attention mechanism</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Intelligent diagnosis is a way to provide clinical decision support for doctors by means of artificial intelligence technology. In the clinic, intelligent diagnosis plays an important role and can be applied to a variety of practical situations. Intelligent diagnosis can help doctors diagnose a patient’s condition, significantly improving the efficiency and accuracy of the diagnosis, and the results can also become an important basis for future diagnosis. The continuous development of modern diagnosis and treatment technology has made medical information increasingly complex. Doctors obtain a large amount of clinical diagnostic information every day and need to make comprehensive decisions based on a large amount of data representing clinical information [<xref ref-type="bibr" rid="ref1">1</xref>]. In addition, the occurrence of complications during pregnancy poses a challenge to doctors.</p>
      <p>Electronic medical records (EMRs) are the most detailed and direct form of clinical medical activities [<xref ref-type="bibr" rid="ref2">2</xref>]. With the rapid growth of EMRs, many methods of intelligent diagnosis using EMRs have become available, enabling significant progress in this field. Early intelligent diagnosis works mainly relied on artificially designed feature templates [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>] or used single traditional machine learning methods, treating intelligent diagnosis as a classification problem. Goldstein et al [<xref ref-type="bibr" rid="ref5">5</xref>] used the Informatics for Integrating Biology &#38; the Bedside 2008 dataset to train a classifier for each disease category to classify obesity and 15 other complications. Medhekar et al [<xref ref-type="bibr" rid="ref6">6</xref>] developed a decision support system based on data mining that used a naïve Bayes classifier to model heart disease. Roopa et al [<xref ref-type="bibr" rid="ref7">7</xref>] used principal component analysis to extract the characteristics of a diabetes dataset and then used a linear regression model to predict whether a patient had diabetes. These methods promoted the application of machine learning and natural language processing in intelligent diagnosis but are still in the early stages (eg, using relatively simple classification methods and a shallow analysis of the EMRs).</p>
      <p>Recently, an increasing number of researchers have focused on neural networks to model intelligent diagnosis and related tasks. Yang et al [<xref ref-type="bibr" rid="ref8">8</xref>] proposed a clinical assistant diagnosis method based on a multilayer convolutional neural network [<xref ref-type="bibr" rid="ref9">9</xref>]. This method uses self-learning to automatically extract the high-level semantic information from EMRs. Chen et al [<xref ref-type="bibr" rid="ref10">10</xref>] used an end-to-end hierarchical neural network to investigate breast cancer problems using EMRs. Hao et al [<xref ref-type="bibr" rid="ref11">11</xref>] used a deep belief network [<xref ref-type="bibr" rid="ref12">12</xref>] to integrate patients’ structured data characteristics to predict the risk of cerebral infarction. Hao et al [<xref ref-type="bibr" rid="ref13">13</xref>] proposed a diagnostic modeling and reasoning system using the dynamic uncertain causality graph and improved the diagnostic accuracy of jaundice. Jeddi et al [<xref ref-type="bibr" rid="ref14">14</xref>] applied the C5.0 algorithm to draw a multibranch decision tree used to aid in the diagnosis of complicated skin diseases.</p>
      <p>When the scale of the training data is limited in a traditional neural network, the advantage of using external knowledge is more obvious. These methods ignore the fact that neural networks and external knowledge can benefit from each other.</p>
      <p>The rapid development of computer technology and biotechnology has enabled the rapid growth of biomedical text resources. These resources contain valuable knowledge that can be used to promote the development of medical informatics. A doctor’s diagnostic process is a combination of their own clinical experience and general medical knowledge. Therefore, medical knowledge is indispensable in the diagnosis process. Fang et al [<xref ref-type="bibr" rid="ref15">15</xref>] proposed a method to diagnose chronic obstructive pulmonary disease based on a knowledge graph and integrated models. Liang et al [<xref ref-type="bibr" rid="ref1">1</xref>] designed a system framework for the data mining of EMRs based on pediatric diseases. This framework combines medical knowledge with a data-driven model and uses logistic regression for the disease hierarchical diagnosis. These efforts provide new methods for medical data analysis, but intelligent diagnosis based on EMRs is still hindered by the following problems:</p>
      <list list-type="bullet">
        <list-item>
          <p>An EMR usually involves multiple diagnostic results, such as normal diagnosis, pathological diagnosis, and complications.</p>
        </list-item>
        <list-item>
          <p>In the aspect of external knowledge, the above methods simply splice the knowledge with the model, which fails to capture the key information well and requires a large number of calculations.</p>
        </list-item>
        <list-item>
          <p>To achieve the most advanced performance, doctors not only care about the diagnostic results but also need to know what medical knowledge contributed to the diagnosis.</p>
        </list-item>
      </list>
      <p>Therefore, in this paper, we design a novel intelligent diagnosis model based on deep learning. Specifically, to capture the important details of the original documents, we use bidirectional gated recurrent units (Bi-GRUs) [<xref ref-type="bibr" rid="ref16">16</xref>] with a hierarchical attention mechanism to model the correlations among words and sentences in EMRs and knowledge documents. Given an analysis of the correlation between the EMRs and medical knowledge documents, we select the most supportive external knowledge to support intelligent diagnosis. Considering the diversity of diagnostic results, we need to conduct intelligent diagnosis in the multilabel classification paradigm. The major contributions of this paper are summarized as follows:</p>
      <list list-type="bullet">
        <list-item>
          <p>Knowledge-aware hierarchical diagnosis model (KHDM) makes full use of the hierarchical deep language model to encode the EMRs and external knowledge documents.</p>
        </list-item>
        <list-item>
          <p>Language model is enriched with high-quality knowledge, combining the advantages of both to perform a knowledge-aware diagnosis.</p>
        </list-item>
        <list-item>
          <p>Experimental results on real-world Chinese obstetric EMRs achieve superior performance over baselines. In addition, we discuss the importance and interpretability of external medical knowledge.</p>
        </list-item>
      </list>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>KHDM contains the following steps, as depicted in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of knowledge-aware hierarchical diagnosis model.</p>
          </caption>
          <graphic xlink:href="medinform_v9i5e25304_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <list list-type="order">
          <list-item>
            <p>Enter the EMR into the document encoder to obtain the document embedding <italic>e</italic> and concatenate it with the numerical features <italic>n</italic> to get the final EMR embedding <italic>e′</italic>.</p>
          </list-item>
          <list-item>
            <p>Input the EMRs and external knowledge documents into the knowledge filter for preliminary screening of the external knowledge, and send the filtered knowledge documents to the document encoder to obtain the knowledge embedding <italic>k</italic>.</p>
          </list-item>
          <list-item>
            <p>Input the EMR embedding and knowledge embedding jointly into the knowledge aggregator. Through the simultaneous analysis of the EMRs and knowledge documents, our model learns a knowledge-side attention component in order to carefully select the most supportive knowledge document <italic>k′</italic> from the external knowledge to support intelligent diagnosis.</p>
          </list-item>
          <list-item>
            <p><italic>e′</italic> and <italic>k′</italic> are concatenated and passed to a sigmoid classifier for the diagnosis. In this section, we introduce the document encoder, knowledge attention module (including the knowledge filter and knowledge aggregator), and output.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Document Encoder</title>
        <p>The purpose of the document encoder is to encode the original EMRs and knowledge documents into continuous low-dimensional embeddings to capture semantic relationships. EMRs and medical knowledge documents usually have potential hierarchical structures. A document consists of several sentences, and a sentence consists of several words. Intuitively, the document embedding problem can be converted into two sequence embedding problems [<xref ref-type="bibr" rid="ref17">17</xref>]. Modeling the semantics of the EMR and external knowledge by word-level and sentence-level representations can fully capture the hierarchical laws and dependencies.</p>
        <p>The words and sentences in a document provide different information and have different degrees of importance. Inspired by Yang et al [<xref ref-type="bibr" rid="ref18">18</xref>], we successively apply the attention mechanism [<xref ref-type="bibr" rid="ref19">19</xref>] at the word level and sentence level so that it can differentiate more important information when constructing the document representation. The attention mechanism not only improves the performance of the deep learning model but also intuitively shows the contributions of words and sentences to the classification decision.</p>
        <p>We use the Bi-GRU sequence encoder with an attention mechanism to encode the EMRs and knowledge documents. Numerical features, such as physiological indicators and laboratory results, are also important in EMRs. To enable more complete use of the EMRs, we separately extract the numerical features and concatenate them with EMRs. Next, we introduce the Bi-GRU sequence encoder, attention encoder, and numerical features in detail.</p>
        <p>Although the word-level and sentence-level encoders can have different structures, we use the same structure here for simplicity, as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Document encoder framework.</p>
          </caption>
          <graphic xlink:href="medinform_v9i5e25304_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Bi-GRU Sequence Encoder</title>
        <p>The importance of words and sentences is highly context dependent. In other words, the same words or sentences may have different degrees of importance in different contexts. We model the semantics of EMRs and external knowledge documents by including word-level and sentence-level representations that can fully capture hierarchical dependencies. Taking the word level as an example, we use Bi-GRU to make a word compilation of the meaning of an entire sentence, where the GRU uses a gate control mechanism to memorize the information of the previous cells.</p>
        <p>The GRU has two gates: the reset gate <italic>r<sub>t</sub></italic> and the update gate <italic>z<sub>t</sub></italic>. The reset gate is used to determine the degree to which the previous information is forgotten, and the update gate is used to decide which information to forget and which new information to enter. <italic>r<sub>t</sub></italic> and <italic>z<sub>t</sub></italic> jointly control the calculation from hidden state <italic>h<sub>t</sub></italic><sub>–1</sub> to hidden state <italic>h<sub>t</sub></italic>. <italic>h<sub>t</sub>̃</italic> is a candidate hidden layer. At time t, the GRU is calculated as follows:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where W<sub>*</sub> is the weight matrix. <italic>x<sub>t</sub></italic> is the sequence vector at time t, and σ is the activation sigmoid function that converts the values of each cell state into the range of 0 to 1 to act as a gate signal. The reset gate <italic>r<sub>t</sub></italic> receives the values of <italic>h<sub>t</sub></italic><sub>–1</sub> and <italic>x<sub>t</sub></italic>. If <italic>r<sub>t</sub></italic> is zero, then the previous state is not saved. In other words, at this time, <italic>h<sub>t</sub> ̃</italic> only contains the information of the current word. Afterward, the update gate <italic>z<sub>t</sub></italic> controls how much information needs to be forgotten from the hidden state <italic>h<sub>t</sub></italic><sub>–1</sub> at the previous moment and how much hidden layer information <italic>h<sub>t</sub> ̃</italic> needs to be added at the moment. The final hidden layer information <italic>h<sub>t</sub></italic> can then be output.</p>
        <p>Bi-GRU uses forward and backward GRUs to encode the sequence in two directions so that the associations between different words (sentences) are taken into account when encoding. Specifically, consider an EMR e = [s<sub>1</sub>,s<sub>2</sub>,<sup>...</sup>,s<sub>L</sub>], where L is the number of sentences and s<sub>i</sub>(1 ≤ <italic>i</italic> ≤ <italic>L</italic>) represents the <italic>i<sup>th</sup></italic> sentence in the document. For each sentence in the document s<sub>i</sub> = [w<sub>i1</sub>,w<sub>i2</sub>,<sup>...</sup>,w<sub>iT</sub>], w<sub>im</sub>(1 ≤ m ≤ T) represents the <italic>m<sup>th</sup></italic> word in s<sub>i</sub>. w<sub>im</sub> is the embedding representation of w<sub>im</sub>, and the encoding method is to concatenate the feature representations of Bi-GRU; that is, the forward hidden state <italic>h<sub>it</sub></italic><sup>→</sup> and backward hidden state <italic>h<sub>it</sub></italic><sup>←</sup> at time t are weighted sums:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
      <sec>
        <title>Attention Encoder</title>
        <p>Not all words have the same effect on the meaning of a sentence, as is the case for sentences within documents. The attention mechanism has become an effective mechanism for mining local differences and highlighting vital elements of data. Therefore, we add an attention mechanism at the word and sentence levels to indicate their importance to the previous level. Compared with the general word-level attention mechanism, the sentence-level attention mechanism plays a more important role in medical documents because certain domain phrases often appear. At the word level, the attention mechanism is introduced to extract those words that are important to the meaning of the sentence, and the representations of these informative words are aggregated to form a sentence vector. The final sentence vector representation <italic>s<sub>i</sub></italic> is defined as follows:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where the weight <italic>a<sub>it</sub></italic> indicates the importance of a word to the meaning of the sentence. The context vector <italic>u<sub>w</sub></italic> is an attention matrix obtained by a random initialization method. It is a cumulative sum of the different probability weights assigned by the attention mechanism and the performance of each hidden layer state. We measure the importance of the word as similarity of <italic>w<sub>it</sub></italic> with a word-level context vector <italic>u<sub>w</sub></italic> and get a normalized importance weight <italic>α<sub>it</sub></italic> through a softmax function. We use the same method to obtain the context-level representation of <italic>u<sub>s</sub></italic> and finally to obtain the document vector <italic>e</italic>:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
      <sec>
        <title>Numerical Features</title>
        <p>Numerical features are very important indicators in Chinese obstetric EMRs. For example, physiological indicators such as the age of the pregnant woman, the number of menopause months, and the uterine height are important factors affecting the clinical judgement. However, there are some cases where the numerical units of EMRs are not uniform. Taking the number of menopause months as an example, it is generally described as “menopause X months,” but some EMRs also use the description method “menopause Y weeks.” We unified the units of this indicator as months, relying on the equation that “4 weeks” is approximately “1 month” in the feature extraction. We also need to consider the validity of the data. According to medical professional knowledge, numerical features have a certain value range. For example, when extracting the physiological parameters of a pregnant woman’s uterine height, if a value is found to be “29 m,” it can be speculated that this data point is incorrect, which will affect the experimental results. This paper determines the accuracy of the data by setting thresholds for each physiological index, and the error data are directly deleted. Detailed threshold descriptions are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. After extracting the numerical features <italic>n</italic>, they are concatenated with the document vector <italic>e</italic> as the final representation of the EMR:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
      <sec>
        <title>Knowledge Attention Module</title>
        <p>Integrating all the external knowledge into the model is very time-consuming, and not all knowledge has enough discernment to support the final classification. Our knowledge attention module aims to alleviate these problems, ensuring that our model can select reliable and useful knowledge for each candidate. This module consists of a knowledge filter and knowledge aggregator. The knowledge filter can preliminarily filter out irrelevant knowledge documents, and the knowledge aggregator uses the attention mechanism to select the most supported knowledge. Considering that external knowledge has too much noise, such an attention mechanism explores the correlation between the EMRs and knowledge documents. KHDM mainly uses this module to make a knowledge-aware diagnosis.</p>
        <sec>
          <title>Knowledge Filter</title>
          <p>We consider the task of the knowledge filter to be text similarity calculation. By calculating the similarity between the input EMRs and the medical knowledge documents, the knowledge not related to the input EMRs will be filtered out. Due to the special nature of medical texts, symptoms and diagnostic methods vary by disease. Therefore, we use the term frequency–inverse document frequency (TF-IDF) to extract the text features of the EMRs and external knowledge. TF(x) represents word frequency, which counts the frequency of each word in an EMR. IDF(x) represents the inverse text frequency and returns the frequency of word x in the corpus, reflecting the importance of words in the text:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v9i5e25304_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where N(x) represents the number of occurrences of word x in the document, N is the total number of words in the document, and D is the total number of documents. D(x) indicates how many documents the word x appears in. Due to professionalism in the medical field, the IDF is smoothed so that domain words that do not appear in all documents can also obtain a suitable IDF value:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v9i5e25304_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The set of documents and knowledge is then viewed as a set of vectors in a vector space. The cosine function is used to measure the similarity between the document and any knowledge. If the similarity score is less than 0.5, we consider these knowledge documents irrelevant and vice versa. After that, we use the document encoder mentioned above to encode the relevant knowledge document. Finally, we obtain the relevant knowledge vector representation: <italic>k</italic> = [<italic>k</italic><sub>1</sub>,<italic>k</italic><sub>2</sub>,<sup>...</sup>,<italic>k</italic><sub>j</sub>].</p>
        </sec>
        <sec>
          <title>Knowledge Aggregator</title>
          <p>This submodule aims to find further medical knowledge that supports intelligent diagnosis and generates an aggregated knowledge embedding <italic>k</italic>′. Therefore, we use the attention mechanism to select the key knowledge documents that are the most critical to the task objective. When generating an aggregated knowledge embedding, more attention is paid to the most important knowledge:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v9i5e25304_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The attention weight <italic>α<sub>t</sub></italic> generated by <italic>k<sub>t</sub></italic> and <italic>e</italic>′ can be regarded as the correlation between the external knowledge and the input EMRs. The top <italic>k</italic>-related knowledge is selected according to the attention weight after sorting. If the number of related knowledge documents is less than <italic>k</italic>, the remaining slots are padded with zero vectors. We define <italic>k</italic> as the average label number per document.</p>
        </sec>
      </sec>
      <sec>
        <title>Output</title>
        <p>To make the final diagnosis prediction, we first concatenate the EMR embedding <italic>e</italic>′ and the knowledge embedding <italic>k</italic>′ and feed it into two fully connected layers to generate a new vector, which is then passed to a sigmoid classifier to produce the predicted results. We consider that all diseases with an output probability greater than τ are positive predictions. The input to the first fully connected layer can also be only <italic>e</italic>′ or <italic>k</italic>′, which means we use only EMRs or external knowledge to make the diagnosis. The loss function for the training is the cross entropy:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i5e25304_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Dataset Details</title>
        <p>We collected 24,192 Chinese obstetric EMRs randomly selected by multiple hospitals as the research material, and each EMR corresponds to one patient. Due to the different writing habits of doctors, there are many different forms of expression for the same diagnostic results. Therefore, the medical thesaurus <italic>International Classification of Diseases, Tenth Revision</italic> [<xref ref-type="bibr" rid="ref20">20</xref>] is used as the basis for the standardization of disease naming. To protect the privacy of patients, personal identifying information such as names and ID numbers of patients was removed [<xref ref-type="bibr" rid="ref21">21</xref>]. The dataset focuses on inpatient department data and consists primarily of structured and unstructured text data. Structured data include the basic information on the patient such as age, ethnicity, and laboratory examination data. Unstructured data mainly refer to the patient’s main complaint, admission, and physical examination. Detailed data descriptions are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. The dataset contains 59 types of disease diagnostic results and is divided into a training set of 21,772 records and a test set of 2420 records according to the results distribution.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Chinese obstetric electronic medical record sample.</p>
          </caption>
          <graphic xlink:href="medinform_v9i5e25304_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>For external knowledge, we collected descriptions of medical concepts from the authoritative textbook <italic>Obstetrics and Gynecology</italic> [<xref ref-type="bibr" rid="ref22">22</xref>] and a medical encyclopedia. The medical concepts mainly include the disease definition, symptoms, and treatment methods. In the end, we collected a total of 72 medical definition documents that make up our external knowledge. All external knowledge was chosen under the guidance of medical experts.</p>
      </sec>
      <sec>
        <title>Hyperparameter Setting</title>
        <p>Since all EMRs and external knowledge documents are written in Chinese, we first use PKUSEG [<xref ref-type="bibr" rid="ref23">23</xref>] to segment the document and set the maximum document length to 1600 characters. We use the GloVe [<xref ref-type="bibr" rid="ref24">24</xref>] model to train word embedding on the corpus of EMRs after word segmentation. The hidden state size of the GRU is set to 100. For text convolutional neural network (TextCNN), this paper sets the filter width to (2, 3, 4, 5), and each filter size is 25 to maintain consistency. After the connection, the representation size of our model becomes 200. Finally, a 200 * <italic>c</italic> fully connected layer is added (<italic>c</italic> is the number of labels).</p>
        <p>Since we use the sigmoid function for classification, the prediction threshold τ is set to 0.5. Average label number per document <italic>k</italic> is 2.688, so we set <italic>k</italic> = 3. We use Adam [<xref ref-type="bibr" rid="ref25">25</xref>] as the optimizer. During the training period, EMRs are selected by random sampling method. We set the learning rate to 0.001 and the batch size to 32.</p>
      </sec>
      <sec>
        <title>Performance on an Obstetric EMR Dataset</title>
        <p>In multilabel learning, each sample may have multiple category labels. Many evaluation metrics for multilabel learning have been proposed [<xref ref-type="bibr" rid="ref26">26</xref>]. We use the average precision, 1-error, hamming loss, ranking loss, and coverage as evaluation metrics. The following text classification models were used as baselines for comparison:</p>
        <list list-type="bullet">
          <list-item>
            <p>Classifier chains [<xref ref-type="bibr" rid="ref27">27</xref>] integrate multiple single classification methods into one model to solve the problem of multilabel classification.</p>
          </list-item>
          <list-item>
            <p>Multilabel k–nearest neighbor [<xref ref-type="bibr" rid="ref28">28</xref>] considers the k instances with the smallest distance from the new instance in the feature space as a set.</p>
          </list-item>
          <list-item>
            <p>Long short-term memory (LSTM) [<xref ref-type="bibr" rid="ref29">29</xref>] uses the last hidden state as the representation of the whole document.</p>
          </list-item>
          <list-item>
            <p>Bidirectional long short-term memory (Bi-LSTM) is a bidirectional LSTM that can obtain long-term context information in the direction of the input.</p>
          </list-item>
          <list-item>
            <p>TextCNN [<xref ref-type="bibr" rid="ref9">9</xref>] uses multiple kernels of different sizes to extract the key information in sentences to better capture the local relevance.</p>
          </list-item>
        </list>
        <p>All text classification models are trained in the multilabel framework. The experimental results on the Chinese obstetric EMR dataset are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Comparative results on Chinese obstetric electronic medical record dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Average precision</td>
                <td>1-error</td>
                <td>Hamming loss</td>
                <td>Ranking loss</td>
                <td>Coverage</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CC<sup>a</sup></td>
                <td>0.5083</td>
                <td>0.4880</td>
                <td>0.0308</td>
                <td>0.1366</td>
                <td>19.7917</td>
              </tr>
              <tr valign="top">
                <td>ML-KNN<sup>b</sup></td>
                <td>0.6109</td>
                <td>0.2488</td>
                <td>0.0258</td>
                <td>0.0709</td>
                <td>10.2347</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>c</sup></td>
                <td>0.8651</td>
                <td>0.0836</td>
                <td>0.0166</td>
                <td>0.0190</td>
                <td>4.4612</td>
              </tr>
              <tr valign="top">
                <td>Bi-LSTM<sup>d</sup></td>
                <td>0.8721</td>
                <td>0.0775</td>
                <td>0.0164</td>
                <td>0.0186</td>
                <td>4.4625</td>
              </tr>
              <tr valign="top">
                <td>TextCNN<sup>e</sup></td>
                <td>0.8652</td>
                <td>0.0961</td>
                <td>0.0188</td>
                <td>0.0203</td>
                <td>4.6035</td>
              </tr>
              <tr valign="top">
                <td>KHDM<sup>f</sup></td>
                <td>0.8929</td>
                <td>0.0713</td>
                <td>0.0156</td>
                <td>0.0165</td>
                <td>4.0833</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>CC: classifier chains.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>ML-KNN: multilabel k–nearest neighbor.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Bi-LSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>TextCNN: text convolutional neural networks.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>KHDM: knowledge-aware hierarchical diagnosis model.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>According to the experimental results, compared with the traditional machine learning methods, the neural network method has achieved better results. The main reason is that the neural network can capture richer features and deeper semantic information. Considering the structured context information, a bidirectional network can significantly improve the performance. For example, Bi-LSTM gives an average precision of 0.8721, while that of the LSTM is 0.8651. In addition, our model is largely superior to other traditional neural network methods. The TextCNN is usually connected to the pooling layer after the convolution layer. Its operation logic is to retain the strongest features from the feature vectors obtained from a convolution kernel so it cannot retain the relative position information of the original input, resulting in information loss. LSTM has a sequence dependency problem and does not perform well when the document is too long. Our model uses a hierarchical structure to divide the document into sentences without the problems of distance dependence and information loss. In general, our model is much better than the other models in all of the evaluation metrics applied, with improvements of 3% to 30%. Making full use of the attention mechanism to integrate external medical knowledge is undoubtedly an important way to improve the effectiveness of intelligent diagnosis.</p>
      </sec>
      <sec>
        <title>Performance on Public Dataset</title>
        <p>This paper casts the obstetric intelligent diagnosis problem into a multilabel classification framework. Therefore, we test the classification effect on two public datasets: DeliciousMIL [<xref ref-type="bibr" rid="ref30">30</xref>] and Hep categories. The former consists of a number of tagged pages on the social bookmarking site delicious.com, with categories including programming, style, and reference, and the latter is a public multilabel dataset available on Magpie, with subject categories relevant to high-energy physics (HEP) abstracts, including astrophysics, experiment-HEP, gravitation and cosmology, phenomenology-HEP, and theory-HEP. <xref ref-type="table" rid="table2">Table 2</xref> provides a brief description of each dataset. The selected external knowledge <italic>k</italic> values of the two datasets are 3 and 1, respectively.</p>
        <p>The external knowledge data for the DeliciousMIL and Hep categories datasets are derived from Wikipedia entry definitions. <xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="table" rid="table4">Table 4</xref> present the results. Similar to the results on the obstetric EMR dataset, it can be clearly observed that our model performs best in multilabel text classification, proving that KHDM is universal for text classification tasks.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Description of public datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Dataset</td>
                <td>Field</td>
                <td>Instances</td>
                <td>Labels</td>
                <td>AL<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>DeliciousMIL</td>
                <td>Social networking sites</td>
                <td>12,234</td>
                <td>20</td>
                <td>2.9574</td>
              </tr>
              <tr valign="top">
                <td>Hep categories</td>
                <td>High-energy physics</td>
                <td>1000</td>
                <td>5</td>
                <td>1.1920</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>AL: average label number per document.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparative results on public dataset DeliciousMIL.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Average precision</td>
                <td>1-error</td>
                <td>Hamming loss</td>
                <td>Ranking loss</td>
                <td>Coverage</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CC<sup>a</sup></td>
                <td>0.3208</td>
                <td>0.8134</td>
                <td>0.2054</td>
                <td>0.4183</td>
                <td>12.9241</td>
              </tr>
              <tr valign="top">
                <td>ML-KNN<sup>b</sup></td>
                <td>0.3703</td>
                <td>0.7621</td>
                <td>0.4748</td>
                <td>0.3488</td>
                <td>11.0213</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>c</sup></td>
                <td>0.5813</td>
                <td>0.3947</td>
                <td>0.1641</td>
                <td>0.1518</td>
                <td>6.9928</td>
              </tr>
              <tr valign="top">
                <td>Bi-LSTM<sup>d</sup></td>
                <td>0.5968</td>
                <td>0.3786</td>
                <td>0.1610</td>
                <td>0.1615</td>
                <td>6.9648</td>
              </tr>
              <tr valign="top">
                <td>TextCNN<sup>e</sup></td>
                <td>0.6299</td>
                <td>0.3639</td>
                <td>0.1760</td>
                <td>0.1344</td>
                <td>6.0637</td>
              </tr>
              <tr valign="top">
                <td>KHDM<sup>f</sup></td>
                <td>0.6386</td>
                <td>0.3312</td>
                <td>0.1255</td>
                <td>0.1284</td>
                <td>5.9101</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>CC: classifier chains.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ML-KNN: multilabel k–nearest neighbor.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>Bi-LSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>TextCNN: text convolutional neural networks.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>KHDM: knowledge-aware hierarchical diagnosis model.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparative results on public dataset Hep categories.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Average precision</td>
                <td>1-error</td>
                <td>Hamming loss</td>
                <td>Ranking loss</td>
                <td>Coverage</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CC<sup>a</sup></td>
                <td>0.5606</td>
                <td>0.6290</td>
                <td>0.2982</td>
                <td>0.4381</td>
                <td>1.9410</td>
              </tr>
              <tr valign="top">
                <td>ML-KNN<sup>b</sup></td>
                <td>0.5733</td>
                <td>0.5800</td>
                <td>0.3460</td>
                <td>0.4433</td>
                <td>2.2300</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>c</sup></td>
                <td>0.6807</td>
                <td>0.5422</td>
                <td>0.2740</td>
                <td>0.2437</td>
                <td>0.9642</td>
              </tr>
              <tr valign="top">
                <td>Bi-LSTM<sup>d</sup></td>
                <td>0.7055</td>
                <td>0.4816</td>
                <td>0.2200</td>
                <td>0.2251</td>
                <td>0.9455</td>
              </tr>
              <tr valign="top">
                <td>TextCNN<sup>e</sup></td>
                <td>0.7903</td>
                <td>0.3429</td>
                <td>0.2420</td>
                <td>0.1550</td>
                <td>0.6207</td>
              </tr>
              <tr valign="top">
                <td>KHDM<sup>f</sup></td>
                <td>0.8929</td>
                <td>0.0713</td>
                <td>0.0156</td>
                <td>0.0165</td>
                <td>4.0833</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>CC: classifier chains.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>ML-KNN: multilabel k–nearest neighbor.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>Bi-LSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>TextCNN: text convolutional neural networks.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>KHDM: knowledge-aware hierarchical diagnosis model.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Ablation Test</title>
        <p>KHDM is a combination of a knowledge attention mechanism and external medical knowledge representation. We conducted an ablation test to assess the contributions of these two components in our model. <xref ref-type="table" rid="table5">Table 5</xref> presents the performance of our model and its ablations on the obstetric EMR dataset. <italic>w/o Knowledge</italic> means using only the EMRs for the intelligent diagnosis, and <italic>w/o Att</italic> means we remove the attention mechanism and all the medical knowledge documents directly concatenated with the EMRs and do not use the knowledge attention module.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Results of the ablation test.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Average precision</td>
                <td>One error</td>
                <td>Hamming loss</td>
                <td>Ranking loss</td>
                <td>Coverage</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>w/o Knowledge</td>
                <td>0.8789</td>
                <td>0.1047</td>
                <td>0.0184</td>
                <td>0.0212</td>
                <td>4.2364</td>
              </tr>
              <tr valign="top">
                <td>w/o Att<sup>a</sup></td>
                <td>0.8519</td>
                <td>0.1022</td>
                <td>0.0164</td>
                <td>0.0181</td>
                <td>4.3210</td>
              </tr>
              <tr valign="top">
                <td>TextCNN<sup>b</sup></td>
                <td>0.8652</td>
                <td>0.0961</td>
                <td>0.0188</td>
                <td>0.0203</td>
                <td>4.6035</td>
              </tr>
              <tr valign="top">
                <td>TextCNN + knowledge</td>
                <td>0.8700</td>
                <td>0.0912</td>
                <td>0.0167</td>
                <td>0.0199</td>
                <td>4.3516</td>
              </tr>
              <tr valign="top">
                <td>KHDM<sup>c</sup></td>
                <td>0.8929</td>
                <td>0.0713</td>
                <td>0.0156</td>
                <td>0.0165</td>
                <td>4.0833</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>Att: attention.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>TextCNN: text convolutional neural networks.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>KHDM: knowledge-aware hierarchical diagnosis model.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>From the experimental results, the following can be seen:</p>
        <list list-type="bullet">
          <list-item>
            <p>When the external knowledge is not introduced or the attention mechanism is not used, the model performance deteriorates.</p>
          </list-item>
          <list-item>
            <p>The models incorporating knowledge are superior to ordinary text classification models with a drop to 0.8789 of model <italic>w/o Knowledge</italic> after the supplementary knowledge is removed. The effectiveness of using external knowledge information is confirmed, and medical knowledge contributes to intelligent diagnosis.</p>
          </list-item>
          <list-item>
            <p>When fusing the medical knowledge, performances of <italic>w/o Att</italic> and <italic>TextCNN + knowledge</italic> significantly increase by simply concatenating the knowledge document.</p>
          </list-item>
        </list>
        <p>However, these models do not use the knowledge attention mechanism but directly concatenate with the external knowledge, which will introduce a large amount of noise. We can see KHDM improves more than 2 percentage points on most evaluation metrics. These ablation test results reflect the importance and rationality of using the attention mechanism to capture the interactions between multiple inputs.</p>
      </sec>
      <sec>
        <title>Interpretability of the Attention Mechanism</title>
        <p>Interpretability is very important for model evaluation, especially in the medical field, as it allows doctors to understand the rationale behind the diagnostic results. To verify that our model can capture the most important sentences and words in a document, we first visualized the hierarchical attention mechanism in the document encoder on the Chinese obstetric EMR dataset.</p>
        <p>As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, every line is a sentence, and we normalize the sentence weights and word weights to ensure that only the important words in the most important sentences are emphasized. Red denotes the weight of a sentence and blue denotes the weight of a word, where the darker the color is, the greater the weight. We know that doctors often diagnose patients by analyzing their clinical symptoms and test results. Our model accurately locates the words <italic>abdominal pain</italic> and <italic>no yellow stain</italic> and their corresponding sentences.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Visualization of attention in document encoder (attention encoder).</p>
          </caption>
          <graphic xlink:href="medinform_v9i5e25304_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Next, we choose a representative example to illustrate the role of the attention mechanism in the knowledge aggregator. We remove all attention values less than 10<sup>–3</sup> from the visualization. As can be seen in <xref rid="figure5" ref-type="fig">Figure 5</xref>, our model pays more attention to the clinical symptom <italic>blood (red part)</italic> and site <italic>cervix (green part)</italic> within the medical knowledge. The darker the color of the line, the higher the attention. Similarly, medical concepts are essential in clinical diagnosis, so medical knowledge with a higher attention score through localization of symptoms and sites will be selected.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Visualization of attention in knowledge aggregator (knowledge attention).</p>
          </caption>
          <graphic xlink:href="medinform_v9i5e25304_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We used only external medical knowledge related to obstetric diseases, but obstetric diagnosis also involves immunology, cytology, genetics, pathology, and other multilevel knowledge. For cardiovascular and cerebrovascular diseases requiring blood pressure and routine blood tests, the numerical features are very important for the diagnosis, and our proposed method can provide support. But for diseases such as cancer, text data alone is not enough and must be combined with other types of medical information such as medical images and signals. To improve the interpretability of the intelligent diagnosis model, communication with the clinic and selection of an appropriate interpretation method in terms of complementing the doctor’s workflow and habits is still necessary. Another limitation that needs to be addressed in achieving intelligent diagnosis based on EMRs is imbalanced datasets. This paper selects common diseases as the research object. In future work, we will focus on diseases with lower frequency.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this paper, we propose KHDM that synchronously and effectively uses Chinese obstetric EMRs and external knowledge. Particularly, the use of the knowledge attention module to selectively leverage medical knowledge not only improves performance but also provides a basis for intelligent diagnosis. The experimental results on a real obstetric EMR dataset show that KHDM can effectively use external knowledge to enhance the language model, thereby improving the performance.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Threshold descriptions in numerical features.</p>
        <media xlink:href="medinform_v9i5e25304_app1.docx" xlink:title="DOCX File , 14 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">Bi-GRU</term>
          <def>
            <p>bidirectional gated recurrent unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">Bi-LSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">HEP</term>
          <def>
            <p>high-energy physics</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">KHDM</term>
          <def>
            <p>knowledge-aware hierarchical diagnosis model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">TextCNN</term>
          <def>
            <p>text convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">TF-IDF</term>
          <def>
            <p>term frequency–inverse document frequency</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work is supported by grant 18ZDA315 from the Major Program of the National Social Science Foundation of China, grant 2019TQ0286v from the China Postdoctoral Science Foundation, grant 192102210260 from the Science and Technique Program of Henan Province, grant SB201901021 from the Medical Science and Technique Program Cosponsored by Henan Province and Ministry, and grants 19A520003 and 20A520038 from the Key Scientific Research Program of Higher Education of Henan Province. The funding body did not play any role in the design of the study; collection, analysis, and interpretation of data; or in writing the manuscript.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tsui</surname>
              <given-names>BY</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Valentim</surname>
              <given-names>CCS</given-names>
            </name>
            <name name-style="western">
              <surname>Baxter</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kermany</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hewett</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>ED</given-names>
            </name>
            <name name-style="western">
              <surname>Karin</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pizzato</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Haw</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Goldbaum</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tremoulet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Evaluation and accurate diagnoses of pediatric diseases using artificial intelligence</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <month>03</month>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>433</fpage>
          <lpage>438</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0335-9</pub-id>
          <pub-id pub-id-type="medline">30742121</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0335-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hornberger</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Electronic health records: a guide for clinicians and administrators</article-title>
          <source>JAMA</source>
          <year>2009</year>
          <month>01</month>
          <day>07</day>
          <volume>301</volume>
          <issue>1</issue>
          <fpage>110</fpage>
          <pub-id pub-id-type="doi">10.1001/jama.2008.910</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Turchin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shubina</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Breydo</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pendergrass</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Einbinder</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Comparison of information content of structured and narrative text data sources on the example of medication intensification</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2009</year>
          <volume>16</volume>
          <issue>3</issue>
          <fpage>362</fpage>
          <lpage>370</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/19261947"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M2777</pub-id>
          <pub-id pub-id-type="medline">19261947</pub-id>
          <pub-id pub-id-type="pii">M2777</pub-id>
          <pub-id pub-id-type="pmcid">PMC2732236</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Pok</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A data mining approach for dyslipidemia disease prediction using carotid arterial feature vectors</article-title>
          <year>2010</year>
          <conf-name>International Conference on Computer Engineering and Technology</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Chengdu</conf-loc>
          <fpage>16</fpage>
          <lpage>18</lpage>
          <pub-id pub-id-type="doi">10.1109/iccet.2010.5485249</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Specializing for predicting obesity and its co-morbidities</article-title>
          <source>J Biomed Inform</source>
          <year>2009</year>
          <month>10</month>
          <volume>42</volume>
          <issue>5</issue>
          <fpage>873</fpage>
          <lpage>886</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(08)00143-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2008.11.001</pub-id>
          <pub-id pub-id-type="medline">19041423</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(08)00143-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC3253373</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pattekari</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Parveen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Prediction system for heart disease using Naïve Bayes</article-title>
          <source>Int J Adv Comput Math Sci</source>
          <year>2012</year>
          <volume>3</volume>
          <issue>3</issue>
          <fpage>290</fpage>
          <lpage>294</lpage>
          <pub-id pub-id-type="doi">10.1109/CIMCA.2016.8053261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roopa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Asha</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A linear model based on principal component analysis for disease prediction</article-title>
          <source>IEEE Access</source>
          <year>2019</year>
          <volume>7</volume>
          <fpage>105314</fpage>
          <lpage>105318</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2019.2931956</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical assistant diagnosis for electronic medical record based on convolutional neural network</article-title>
          <source>Sci Rep</source>
          <year>2018</year>
          <month>04</month>
          <day>20</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>6329</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-018-24389-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-018-24389-w</pub-id>
          <pub-id pub-id-type="medline">29679019</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-018-24389-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC5910396</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Convolutional neural networks for sentence classification</article-title>
          <year>2014</year>
          <conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 25-29</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1746</fpage>
          <lpage>1751</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Breast cancer classification with electronic medical records using hierarchical attention bidirectional networks</article-title>
          <year>2018</year>
          <conf-name>2018 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>
          <conf-date>December 3-6</conf-date>
          <conf-loc>Madrid, Spain</conf-loc>
          <fpage>983</fpage>
          <lpage>988</lpage>
          <pub-id pub-id-type="doi">10.1109/bibm.2018.8621479</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Usama</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hossain</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Ghoneim</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Recurrent convolutional neural network based multimodal disease risk prediction</article-title>
          <source>Future Generation Computer Systems</source>
          <year>2019</year>
          <month>03</month>
          <volume>92</volume>
          <fpage>76</fpage>
          <lpage>83</lpage>
          <pub-id pub-id-type="doi">10.1016/j.future.2018.09.031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Reducing the dimensionality of data with neural networks</article-title>
          <source>Science</source>
          <year>2006</year>
          <month>07</month>
          <day>28</day>
          <volume>313</volume>
          <issue>5786</issue>
          <fpage>504</fpage>
          <lpage>507</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.sciencemag.org/cgi/pmidlookup?view=long&#38;pmid=16873662"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/science.1127647</pub-id>
          <pub-id pub-id-type="medline">16873662</pub-id>
          <pub-id pub-id-type="pii">313/5786/504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Geng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Intelligent diagnosis of jaundice with dynamic uncertain causality graph model</article-title>
          <source>J Zhejiang Univ Sci B</source>
          <year>2017</year>
          <month>05</month>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>393</fpage>
          <lpage>401</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28471111"/>
          </comment>
          <pub-id pub-id-type="doi">10.1631/jzus.B1600273</pub-id>
          <pub-id pub-id-type="medline">28471111</pub-id>
          <pub-id pub-id-type="pmcid">PMC5442976</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeddi</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Arabfard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kermany</surname>
              <given-names>ZA</given-names>
            </name>
          </person-group>
          <article-title>Intelligent diagnostic assistant for complicated skin diseases through C5's algorithm</article-title>
          <source>Acta Inform Med</source>
          <year>2017</year>
          <month>09</month>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>182</fpage>
          <lpage>186</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29114111"/>
          </comment>
          <pub-id pub-id-type="doi">10.5455/aim.2017.25.182-186</pub-id>
          <pub-id pub-id-type="medline">29114111</pub-id>
          <pub-id pub-id-type="pii">AIM-25-182</pub-id>
          <pub-id pub-id-type="pmcid">PMC5639897</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Di</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Diagnosis of COPD based on a knowledge graph and integrated model</article-title>
          <source>IEEE Access</source>
          <year>2019</year>
          <volume>7</volume>
          <fpage>46004</fpage>
          <lpage>46013</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2019.2909069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural machine translation by jointly learning to align and translate</article-title>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations</conf-name>
          <conf-date>May 7-9</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1409.0473"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Document modeling with gated recurrent neural network for sentiment classification</article-title>
          <year>2015</year>
          <conf-name>2015 conference on empirical methods in natural language processing (EMNLP)</conf-name>
          <conf-date>September 17-21</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <fpage>1422</fpage>
          <lpage>1432</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D15-1167.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d15-1167</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Hierarchical attention networks for document classification</article-title>
          <year>2016</year>
          <conf-name>15th conference of the North American chapter of the association for computational linguistics: human language technologies</conf-name>
          <conf-date>June 12-17</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <fpage>1480</fpage>
          <lpage>1489</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N16-1174.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n16-1174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
          <conf-name>31st Conference on Neural Information Processing Systems (NIPS)</conf-name>
          <conf-date>December 4-9</conf-date>
          <conf-loc>Long Beach, California</conf-loc>
          <fpage>5998</fpage>
          <lpage>6008</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.5040/9781350101272.00000005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sundararajan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Henderson</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Perry</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Muggivan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ghali</surname>
              <given-names>WA</given-names>
            </name>
          </person-group>
          <article-title>New ICD-10 version of the Charlson comorbidity index predicted in-hospital mortality</article-title>
          <source>J Clin Epidemiol</source>
          <year>2004</year>
          <month>12</month>
          <volume>57</volume>
          <issue>12</issue>
          <fpage>1288</fpage>
          <lpage>1294</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2004.03.012</pub-id>
          <pub-id pub-id-type="medline">15617955</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(04)00164-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Leveraging text skeleton for de-identification of electronic medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2018</year>
          <month>03</month>
          <day>22</day>
          <volume>18</volume>
          <issue>Suppl 1</issue>
          <fpage>18</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-018-0598-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-018-0598-6</pub-id>
          <pub-id pub-id-type="medline">29589571</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-018-0598-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC5872383</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Obstetrics and Gynecology. 9th edition</source>
          <year>2018</year>
          <publisher-loc>Beijing</publisher-loc>
          <publisher-name>People's Medical Publishing House</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>PKUSEG: a toolkit for multi-domain Chinese word segmentation</article-title>
          <source>ArXiv.</source>
          <access-date>2019-06-27</access-date>
          <comment>
             Preprint posted on Jun 27, 2019. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1906.11455">http://arxiv.org/abs/1906.11455</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Glove: global vectors for word representation</article-title>
          <year>2014</year>
          <conf-name>2014 conference on empirical methods in natural language processing (EMNLP)</conf-name>
          <conf-date>October 25-29</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>1543</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/D14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Adam: a method for stochastic optimization</article-title>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations</conf-name>
          <conf-date>May 7-9</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1412.6980"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>A review on multi-label learning algorithms</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2014</year>
          <month>8</month>
          <volume>26</volume>
          <issue>8</issue>
          <fpage>1819</fpage>
          <lpage>1837</lpage>
          <pub-id pub-id-type="doi">10.1109/TKDE.2013.39</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Read</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pfahringer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Classifier chains for multi-label classification</article-title>
          <year>2008</year>
          <conf-name>Joint European Conference on Machine Learning and Knowledge Discovery in Databases</conf-name>
          <conf-date>September 14-18</conf-date>
          <conf-loc>Ghent, Belgium</conf-loc>
          <fpage>254</fpage>
          <lpage>269</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-04174-7_17</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>ML-KNN: a lazy learning approach to multi-label learning</article-title>
          <source>Pattern Recognition</source>
          <year>2007</year>
          <month>7</month>
          <volume>40</volume>
          <issue>7</issue>
          <fpage>2038</fpage>
          <lpage>2048</lpage>
          <pub-id pub-id-type="doi">10.1016/j.patcog.2006.12.019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Recurrent neural network for text classification with multi-task learning</article-title>
          <year>2016</year>
          <conf-name>Twenty-Fifth International Joint Conference on Artificial Intelligence (IJCAI)</conf-name>
          <conf-date>July 9-15</conf-date>
          <conf-loc>New York</conf-loc>
          <fpage>2873</fpage>
          <lpage>2879</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ijcai.org/Proceedings/16/Papers/408.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/0004-3702(82)90046-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soleimani</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised multi-label topic models for document classification and sentence labeling</article-title>
          <year>2016</year>
          <conf-name>25th ACM international on conference on information and knowledge management (CIKM)</conf-name>
          <conf-date>October 24-26</conf-date>
          <conf-loc>Indianapolis</conf-loc>
          <fpage>105</fpage>
          <lpage>114</lpage>
          <pub-id pub-id-type="doi">10.1145/2983323.2983752</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
