<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i5e17637</article-id>
      <article-id pub-id-type="pmid">32364514</article-id>
      <article-id pub-id-type="doi">10.2196/17637</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Multi-Level Representation Learning for Chinese Medical Entity Recognition: Model Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Hao</surname>
            <given-names>Tianyong</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Song</surname>
            <given-names>Wei</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Linfeng</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Zhichang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>College of Computer Science and Engineering</institution>
            <institution>Northwest Normal University</institution>
            <addr-line>967 Anning East Road</addr-line>
            <addr-line>Lanzhou, </addr-line>
            <country>China</country>
            <phone>86 13038769329</phone>
            <email>zzc@nwnu.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3306-8493</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>Lin</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7902-4819</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Peilin</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2597-6133</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>College of Computer Science and Engineering</institution>
        <institution>Northwest Normal University</institution>
        <addr-line>Lanzhou</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Zhichang Zhang <email>zzc@nwnu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>5</issue>
      <elocation-id>e17637</elocation-id>
      <history>
        <date date-type="received">
          <day>30</day>
          <month>12</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>24</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>3</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Zhichang Zhang, Lin Zhu, Peilin Yu. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 04.05.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2020/5/e17637" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Medical entity recognition is a key technology that supports the development of smart medicine. Existing methods on English medical entity recognition have undergone great development, but their progress in the Chinese language has been slow. Because of limitations due to the complexity of the Chinese language and annotated corpora, these methods are based on simple neural networks, which cannot effectively extract the deep semantic representations of electronic medical records (EMRs) and be used on the scarce medical corpora. We thus developed a new Chinese EMR (CEMR) dataset with six types of entities and proposed a multi-level representation learning model based on Bidirectional Encoder Representation from Transformers (BERT) for Chinese medical entity recognition.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to improve the performance of the language model by having it learn multi-level representation and recognize Chinese medical entities.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>In this paper, the pretraining language representation model was investigated; utilizing information not only from the final layer but from intermediate layers was found to affect the performance of the Chinese medical entity recognition task. Therefore, we proposed a multi-level representation learning model for entity recognition in Chinese EMRs. Specifically, we first used the BERT language model to extract semantic representations. Then, the multi-head attention mechanism was leveraged to automatically extract deeper semantic information from each layer. Finally, semantic representations from multi-level representation extraction were utilized as the final semantic context embedding for each token and we used softmax to predict the entity tags.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The best F1 score reached by the experiment was 82.11% when using the CEMR dataset, and the F1 score when using the CCKS (China Conference on Knowledge Graph and Semantic Computing) 2018 benchmark dataset further increased to 83.18%. Various comparative experiments showed that our proposed method outperforms methods from previous work and performs as a new state-of-the-art method.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The multi-level representation learning model is proposed as a method to perform the Chinese EMRs entity recognition task. Experiments on two clinical datasets demonstrate the usefulness of using the multi-head attention mechanism to extract multi-level representation as part of the language model.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>medical entity recognition</kwd>
        <kwd>multi-level representation learning</kwd>
        <kwd>Chinese</kwd>
        <kwd>natural language processing</kwd>
        <kwd>electronic medical records</kwd>
        <kwd>multi-head attention mechanism</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Electronic medical records (EMRs) comprise patients’ health information. Diagnostic accuracy can be improved by making full use of the available information in EMRs. Medical entity recognition (ER) is a fundamental task of medical natural language processing (NLP) and is usually treated as a sequence labeling problem [<xref ref-type="bibr" rid="ref1">1</xref>]. As shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, in which three predefined entity categories are disease, drug, and treatment, when using the BIO (beginning of the noun phrase, middle of the noun phrase, and not a noun phrase) labeling mode to tag Chinese EMRs, the candidate label set contains seven types: B-Dis (disease), I-Dis, B-Med (medicine), I-Med, B-Tre (treatment), I-Tre, and O.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A tagging example of Chinese electronic medical records.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17637_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Generally, the methods of ER can be divided into two categories. The first category leverages rules and dictionaries to represent linguistic features and domain knowledge to identify clinical entities [<xref ref-type="bibr" rid="ref2">2</xref>]. The second category is based on traditional machine learning and neural networks [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]; this type of method greatly improves the performance of ER models but requires large-scale labeled data during model parameter training. In the medical field, creating annotation datasets is restricted by professional knowledge and legal regulations, so the lack of annotated corpora becomes one of the greatest technical challenges. At present, ER attracts a lot of attention from the field to improve the representation learning capability of current methods. Research studies have demonstrated that using embedding techniques can help solve the problem of missing supervised data in NLP tasks, including the factorization methods of Global Vectors (GloVe) [<xref ref-type="bibr" rid="ref9">9</xref>], the neural methods of word2vec [<xref ref-type="bibr" rid="ref10">10</xref>] and fastText [<xref ref-type="bibr" rid="ref11">11</xref>], and more recent dynamic methods that take into account the context, such as Embeddings from Language Models (ELMo) [<xref ref-type="bibr" rid="ref12">12</xref>] and OpenAI Generative Pre-trained Transformer (GPT) [<xref ref-type="bibr" rid="ref13">13</xref>]. Those embedding technologies can capture the context of semantics in unsupervised data and generate different vector representations of the same word in different contextual situations.</p>
        <p>Among them, Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref14">14</xref>] integrates many top ideas of language models and gives a particularly prominent performance. Transform-block is a feature extractor and learns different types of abstract granularity information. Multi-layer information is iterated layer by layer to generate embedding representation. In the actual training process, most downstream tasks take BERT's last embedding vector as the input of the model. However, studies found that different NLP tasks have different characteristics of requirements. Therefore, combining task features into the language model can reduce the loss of extracted information by the feature extractor and improve the utilization of language models. For example, Peters et al [<xref ref-type="bibr" rid="ref12">12</xref>] explicitly showed that the lower layer fits into the local semantic relationships, the higher layer is suitable for longer-range relationships, and the final layer specializes in the language model. Peters et al [<xref ref-type="bibr" rid="ref15">15</xref>] also showed that combining all of a model's internal semantic states, by using a weighted-sum method to represent the vector of a word, can enrich the characteristics of the word in learning deep contextualized embedding representations. Because the Chinese ER task focuses on word granularity information, this is a straightforward way to use the information extracted from the low-layer representation.</p>
        <p>In this work, we tackle representation using the BERT language model. Our objective is to extract each layer of semantic information using feature extractors. We constructed a multi-level representation learning model for the optimal integration of information. Our contributions can be summarized as follows:</p>
        <list list-type="order">
          <list-item>
            <p>We manually annotated a new Chinese EMR (CEMR) corpus for ER tasks. Moreover, we propose a multi-level representation learning model to mine hidden representation.</p>
          </list-item>
          <list-item>
            <p>The proposed model takes advantage of the multi-head attention mechanism to integrate more suitable information from each layer and can perform as a state-of-the-art method on two clinical text datasets.</p>
          </list-item>
          <list-item>
            <p>The best F1 score achieved by the experiment was 82.11% on the CEMR corpus and significant improvement on the CCKS (China Conference on Knowledge Graph and Semantic Computing) 2018 benchmark dataset was attained.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Chinese Electronic Medical Record Dataset: A Newly Constructed Corpus</title>
        <p>Large labeled datasets are not always readily accessible. To facilitate the research on the ER task of the Chinese EMRs and future work in related topics, we constructed a new manually annotated CEMR dataset. The normalization of the labeling process refers to a large number of annotation guidelines [<xref ref-type="bibr" rid="ref16">16</xref>]. All EMRs came from Third-Class A-Level hospitals in Gansu Province, China, which contained 80,000 EMRs across 14 departments. Manual labeling of 4000 medical records provided the data for ER experiments. <xref ref-type="table" rid="table1">Table 1</xref> shows the data distribution of the 14 hospital departments. The CEMR corpus contains six types of entities: disease (Dis), symptom (Sym), test, treatment (Tre), medicine (Med), and abnormal inspection result (Abn). The categories are defined as follows:</p>
        <list list-type="order">
          <list-item>
            <p>Disease: refers to a specific abnormal pathological condition. This abnormal life condition is caused by disorders of self-regulation, such as diabetes.</p>
          </list-item>
          <list-item>
            <p>Symptom: refers to subjective feelings described by patients or objective facts observed externally, such as abdominal distension.</p>
          </list-item>
          <list-item>
            <p>Test: includes examination procedures, items, and equipment to collect and confirm more information about the disease or symptom, such as electrocardiogram.</p>
          </list-item>
          <list-item>
            <p>Treatment: refers to a treatment program or intervention to treat diseases or relieve symptoms, such as neurotrophic treatment.</p>
          </list-item>
          <list-item>
            <p>Medicine: refers to a chemical substance used to prevent and treat diseases or to strengthen the body and improve mental state, such as insulin.</p>
          </list-item>
          <list-item>
            <p>Abnormal inspection result: refers to an abnormal change or inspection result observed by doctors or by examination equipment, such as a little sputum sound.</p>
          </list-item>
        </list>
        <p>Before labeling the data, private information was removed in the EMRs, such as patients’ names, addresses, and hospital IDs. In the process of labeling samples, the annotation tool is developed specifically for the ER task. Moreover, some strategies have been developed to create high-quality annotated data. For example, the annotation samples will be randomly checked at any time.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Electronic medical record (EMR) data distribution by department.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Department</td>
                <td>EMR count, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Neurosurgery</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Neurology</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Cardiology</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Gynecology and obstetrics</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Andrology</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Respiratory medicine</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Cardiovasology</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Hepatobiliary surgery</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Ophthalmology</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Orthopedics</td>
                <td>77 (1.93)</td>
              </tr>
              <tr valign="top">
                <td>Gynecology</td>
                <td>101 (2.53)</td>
              </tr>
              <tr valign="top">
                <td>Pediatrics</td>
                <td>232 (5.80)</td>
              </tr>
              <tr valign="top">
                <td>Internal medicine</td>
                <td>970 (24.25)</td>
              </tr>
              <tr valign="top">
                <td>Surgery</td>
                <td>1495 (37.38)</td>
              </tr>
              <tr valign="top">
                <td>Other</td>
                <td>432 (10.80)</td>
              </tr>
              <tr valign="top">
                <td>Total</td>
                <td>4000 (100)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>The goal of the ER task is to provide the model with an EMR and its semantic types, so that it can extract and classify all characters in the text. The proposed model consists of three stacked layers: the input layer, the feature extraction layer, and the output layer.</p>
        <p>As shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, the model first used the BERT language model to extract the semantic representations. Then, the multi-head attention mechanism was leveraged to automatically extract deeper semantic information from each layer. Finally, the semantic information from the multi-level representation extraction was utilized as the final semantic context embedding for each token and was input into the softmax layer to predict the entity tag. The input sentence was denoted as C = (c<sub>1</sub>, c<sub>2</sub>, c<sub>3</sub>, ..., c<sub>n</sub>), where c<sub>n</sub> represented the <italic>n</italic>-th character in sentence C of the Chinese EMR. Correspondingly, the output sentence’s predicted tag sequence was denoted as Y = (y<sub>1</sub>, y<sub>2</sub>, y<sub>3</sub> ..., y<sub>n</sub>), where y<sub>n</sub> belonged to one of the sets: B-Dis, I-Dis, B-Sym, I-Sym, B-Test, I-Test, B-Tre, I-Tre, B-Med, I-Med, B-Abn, I-Abn, or O. In the following text, we introduce the BERT language model and describe the proposed multi-level representation learning model.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Multi-level representation learning for ER model. B-Sym: beginning of the noun phrase for the symptom entity; B-Test: beginning of the noun phrase for the test entity; C: input sentence; E: input embedding; I-Sym: middle of the noun phrase for the symptom entity; I-Test: middle of the noun phrase for the test entity; O: not a noun phrase; Trm: transform-block; y: output sentence’s predicted tag sequence.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17637_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Bidirectional Encoder Representations From Transformers</title>
        <p>BERT was designed to learn deep bidirectional representations by jointly conditioning both the left and right contexts in all layers. It was based on multi-layer bidirectional encoder transformers and could be used for different architectures. When given a character-level sequence C = (c<sub>1</sub>, c<sub>2</sub>, c<sub>3</sub>, ..., c<sub>n</sub>), BERT was formulated as follows:</p>
        <disp-formula><italic>h<sub>1</sub></italic> = <italic>E<sub>Token</sub></italic> + <italic>E<sub>Segment</sub></italic> + <italic>E<sub>Position</sub></italic> (1)</disp-formula>
        <disp-formula><italic>h<sub>l</sub></italic> = <italic>Trm</italic>(<italic>h<sub>l–1</sub></italic>) (2)</disp-formula>
        <disp-formula><italic>Y<sup>BERT</sup></italic> = <italic>Softmax</italic>(<italic>w<sub>O</sub>h<sub>L</sub></italic> + <italic>b<sub>O</sub></italic>) (3)</disp-formula>
        <p>where <italic>h</italic><sub>1</sub> represents input embedding for a sequence and is made up of <italic>E<sub>Token</sub></italic>, <italic>E<sub>Segment</sub></italic>, and <italic>E<sub>Position</sub></italic>, which mean token, segment, and position for a sentence, respectively. BERT leverages the transformer as its feature extractor. <italic>Trm</italic> is a transform-block that includes self-attention, the fully connected layers, and the output layer. The hidden state of the current layer <italic>l</italic> came from the previous layer <italic>l–1</italic>, and <italic>L</italic> was the last layer. <italic>Y<sup>BERT</sup></italic> denotes the output layer that predicts the sequence labels. In the above equations, <italic>w<sub>O</sub></italic> denotes the function weight and <italic>b<sub>O</sub></italic> is the function bias. All parameters of the transform-block were trained in advance on a large document-level corpus using a masked language model and were fine-tuned by predicting task-specific labels with the output layer to maximize the log-probability of the correct label.</p>
      </sec>
      <sec>
        <title>Multi-Level Representation Learning for Entity Recognition</title>
        <p>The Multi-Level Representation Learning for ER model (Multi-Level ER) could automatically integrate deeper semantic information from all layers of the feature extractor for ER task. The proposed language model took advantage of the multi-head attention mechanism. Multi-head attention is a special type of attention that allowed the model to focus on different positions of subspace representation information and could learn more about the connections between internal elements. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the calculation process of the multi-head attention mechanism when calculating the weight of the transform-block output knowledge. The query (Q), key (K), and value (V) in the transform-block were calculated. The process of acquiring Q, K, and V could be written as follows:</p>
        <disp-formula><italic>H</italic> = <italic>Concat</italic>(<italic>h<sub>1</sub></italic>, <italic>h<sub>2</sub></italic>, <italic>h<sub>3</sub></italic>, ..., <italic>h<sub>L</sub></italic>) (4)</disp-formula>
        <disp-formula><italic>Q</italic> = <italic>w<sub>Q</sub>h<sub>L</sub></italic> + <italic>b<sub>L</sub></italic> (5)</disp-formula>
        <disp-formula><italic>K</italic> = <italic>w<sub>K</sub>H</italic> + <italic>b<sub>K</sub></italic> (6)</disp-formula>
        <disp-formula><italic>V</italic> = <italic>w<sub>V</sub>H</italic> + <italic>b<sub>V</sub></italic> (7)</disp-formula>
        <p>where <italic>h<sub>L</sub></italic> denotes the hidden state of the final layer of the transform-block. The parameters <italic>w<sub>Q</sub></italic>, <italic>w<sub>K</sub></italic>, and <italic>w<sub>V</sub></italic> are weight matrices. The parameters <italic>b<sub>L</sub></italic>, <italic>b<sub>K</sub></italic>, and <italic>b<sub>V</sub></italic> are bias matrices. The attention function is calculated as follows:</p>
        <disp-formula><italic>head<sub>i</sub></italic> = <italic>Softmax</italic>(<italic>Q<sub>L</sub>K<sup>T</sup></italic>/<italic>√d</italic>)<italic>V</italic> (8)</disp-formula>
        <p>where <italic>head<sub>i</sub></italic> means the <italic>i</italic>-th head. <italic>Q<sub>L</sub></italic> is the query key value of the last <italic>L</italic> layer. <italic>√d</italic> is used to control the order of magnitude of calculation results and <italic>d</italic> denotes the dimension of the <italic>K</italic> vector. In this work, we used multi-head attention, as introduced in the following equation:</p>
        <disp-formula><italic>E</italic> = <italic>Concat</italic>(<italic>head<sub>1</sub></italic>, <italic>head<sub>2</sub></italic>, <italic>head<sub>3</sub></italic>, ..., <italic>head<sub>l</sub></italic>)<italic>w<sub>h</sub></italic> + <italic>b<sub>h</sub></italic> (9)</disp-formula>
        <p>where <italic>w<sub>h</sub></italic> is used to balance the head weight. For the final layer of the network, we pass the results into a fully connected layer with a softmax function, as follows:</p>
        <disp-formula><italic>Y<sup>Multi-Level ER</sup></italic> = <italic>Softmax</italic>(<italic>w<sub>O</sub>E</italic> + <italic>b<sub>O</sub></italic>) (10)</disp-formula>
        <p>where <italic>w<sub>O</sub></italic> is the output weight matrix and <italic>b<sub>O</sub></italic> is the bias of the output layer.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Multi-head attention mechanism. K: key; Q: query; V: value.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17637_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Experiment</title>
        <p>This model was supported by multiple sets of comparative experiments. Each group of experiments was repeated three times, and the result in the middle of the ranking was taken as the final result.</p>
        <sec>
          <title>Dataset and Evaluation Criteria</title>
          <p>We evaluated the proposed model on two datasets: the CCKS 2018 dataset and the CEMR dataset. The CCKS 2018 dataset was adopted from the Chinese EMR named ER task at the CCKS, which included 1000 admission records. In the experiment, 600 records were used as training data and the remaining were test data. Comparative experiments were also made on the new CEMR corpus, which contained 4000 documents. We further split the corpus set by 60%, 20%, and 20% as training, validation, and test sets, respectively. <xref ref-type="table" rid="table2">Table 2</xref> shows the distribution of documents in two datasets.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Components of the two datasets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="220"/>
              <col width="160"/>
              <col width="220"/>
              <col width="250"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>Dataset</td>
                  <td colspan="4">Number of records per set</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Total</td>
                  <td>Training set</td>
                  <td>Validation set</td>
                  <td>Test set</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>CEMR<sup>a</sup> dataset</td>
                  <td>4000</td>
                  <td>2400</td>
                  <td>800</td>
                  <td>800</td>
                </tr>
                <tr valign="top">
                  <td>CCKS<sup>b</sup> 2018</td>
                  <td>1000</td>
                  <td>600</td>
                  <td>N/A<sup>c</sup></td>
                  <td>400</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>CEMR: Chinese electronic medical record.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>CCKS: China Conference on Knowledge Graph and Semantic Computing.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>Not applicable; because the comparison method does not divide the validation set on the CCKS dataset, we have kept this the same as the original experiment to make the comparison fair.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>To evaluate the performance of all prediction methods fairly, the results were validated by precision (P), recall (R), and F1 scores (F1) as measurements to evaluate the recognition effectiveness of the model; these were defined as follows:</p>
          <disp-formula>P = TP/(TP + FP) (11)</disp-formula>
          <disp-formula>R = TP/(TP + FN) (12)</disp-formula>
          <disp-formula>F1 = (2 × P × R)/(P + R) (13)</disp-formula>
          <p>An entity is annotated as correct when its category and boundary are fully labeled correctly. TP is the count of entity labels presenting the same labels as gold standard labels, FP is the count of recognized entities marked incorrectly in the results, and FN is the count of the gold standard entities that are not present in the results of the indicator.</p>
        </sec>
        <sec>
          <title>Parameter Setup</title>
          <p>Hyperparameter configuration was adjusted according to the performance on the described validation sets. We used a publicly available pretraining language representation model, namely the BERT<sub>BASE-Chinese-uncased</sub>. This model has 12 layers, a hidden size of 768, and 12 heads. The multi-head attention mechanism was utilized to automatically integrate all layers of information. By comparing experimental results with different head numbers, we had set the head number to 12. We fine-tuned the model over 10 epochs with a batch size of 32. The maximum training sentence length was 64. The model was trained with the AdamW optimizer with a learning rate of 1e-5 and we applied a dropout rate of 0.3.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>We summarized the overall performance by computing the F1 score; the results are illustrated in <xref ref-type="table" rid="table3">Table 3</xref>. On the CEMR dataset, we compared the multi-level ER learning model with previous classic methods, including conditional random field (CRF), convolutional neural network (CNN)+bidirectional long short-term memory (BiLSTM)+CRF, lattice long short-term memory (LSTM), and BERT. We found that the proposed model is better than state-of-the-art baseline methods, with improvements in F1 score of 0.94% to 4.9%. Our multi-level ER learning model had improved by 1.48% in its <italic>P</italic> value, 0.47% in its R value, and 0.94% in its F1 score compared to the BERT model. The result also demonstrated that pretraining the multi-level ER learning language model was highly effective for task-specific Chinese EMR ER.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of method performance on the Chinese electronic medical record (CEMR) dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="550"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td><italic>P</italic> value (%)</td>
                <td>R value (%)</td>
                <td>F1 score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Conditional random field (CRF)</td>
                <td>88.57</td>
                <td>68.43</td>
                <td>77.21</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>a</sup>+BiLSTM<sup>b</sup>+CRF</td>
                <td>81.51</td>
                <td>76.92</td>
                <td>79.15</td>
              </tr>
              <tr valign="top">
                <td>Lattice long short-term memory (LSTM)</td>
                <td>88.60</td>
                <td>74.48</td>
                <td>80.93</td>
              </tr>
              <tr valign="top">
                <td>Bidirectional Encoder Representations from Transformers (BERT)</td>
                <td>83.73</td>
                <td>78.76</td>
                <td>81.17</td>
              </tr>
              <tr valign="top">
                <td>Multi-level representation learning for entity recognition (multi-level ER)</td>
                <td>85.21</td>
                <td>79.23</td>
                <td>82.11</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>We also applied our model to the widely used benchmark CCKS 2018 dataset and used the same data split to compare it. Huang et al [<xref ref-type="bibr" rid="ref17">17</xref>] proposed a BiLSTM-CRF model for sequence tagging, and Cai et al [<xref ref-type="bibr" rid="ref18">18</xref>] proposed an SM-LSTM-CRF model, based on the self-matching attention mechanism (SM), designed for the named ER task. The results are shown in <xref ref-type="table" rid="table4">Table 4</xref>. Under the condition of not needing any external resources, the proposed multi-level ER learning model already outperformed the previous SM-LSTM-CRF model by 3.1% on the F1 score.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparison of method performance on the China Conference on Knowledge Graph and Semantic Computing 2018 dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="450"/>
            <col width="190"/>
            <col width="180"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td><italic>P</italic> value (%)</td>
                <td>R value (%)</td>
                <td>F1 score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>BiLSTM<sup>a</sup>-CRF<sup>b</sup> [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td>65.68</td>
                <td>69.04</td>
                <td>67.32</td>
              </tr>
              <tr valign="top">
                <td>SM<sup>c</sup>-LSTM-CRF [<xref ref-type="bibr" rid="ref18">18</xref>]</td>
                <td>80.54</td>
                <td>79.61</td>
                <td>80.08</td>
              </tr>
              <tr valign="top">
                <td>Multi-level representation learning for entity recognition (multi-level ER)</td>
                <td>83.90</td>
                <td>82.47</td>
                <td>83.18</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>CRF: conditional random field.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>SM: self-matching attention mechanism.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>The Effect of Assembling Methods</title>
        <p>We compared the effects of different assembling methods on model performance to verify the ability of the multi-head attention mechanism to combine hierarchical information. As listed in <xref ref-type="table" rid="table5">Table 5</xref>, we first applied concatenation that directed the horizontal concatenated tensors; the F1 score was 81.51%. We then adopted the sum average method to get an F1 score of 81.11%. We finally adopted the multi-head attention method, given that it had the best overall performance compared to several other methods we evaluated. The results showed that integrated hidden information can acquire more suitable representation; the multi-head attention mechanism can be leveraged to automatically extract deeper semantic information from each layer, which is the most effective assembling method.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>The effect of assembling methods.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="460"/>
            <col width="180"/>
            <col width="170"/>
            <col width="190"/>
            <thead>
              <tr valign="top">
                <td>Assembling method</td>
                <td><italic>P</italic> value (%)</td>
                <td>R value (%)</td>
                <td>F1 score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Concatenation</td>
                <td>84.22</td>
                <td>78.97</td>
                <td>81.51</td>
              </tr>
              <tr valign="top">
                <td>Sum average</td>
                <td>83.27</td>
                <td>79.06</td>
                <td>81.11</td>
              </tr>
              <tr valign="top">
                <td>Multi-head attention mechanism</td>
                <td>85.21</td>
                <td>79.23</td>
                <td>82.11</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>The Effect of Extraction Layer Numbers</title>
        <p>To examine the impact of extraction layer numbers on model performance, we performed comparative experiments using various extraction layer numbers; the results are shown in <xref ref-type="table" rid="table6">Table 6</xref>. It was observed that the performance of all layers was superior to that of the other numbers of layers, which introduced multi-level ER into the language model and enhanced model performance. By and large, the tendency was that performance improved as the number of extracting layers increased. However, we also discovered that extracting the last four layers gave higher F1 scores than extracting the last six or two layers. The analysis showed that the results were closely related to the specific dataset. Of course, as the number of layers increased, parameters required by the neural network also increased significantly. Therefore, when there was a high demand for speed on the model, we could select a structure that included the last four layers to optimize time efficiency.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>The effect of extracted layer numbers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="420"/>
            <col width="190"/>
            <col width="200"/>
            <col width="190"/>
            <thead>
              <tr valign="top">
                <td>Extraction layer number</td>
                <td><italic>P</italic> value (%)</td>
                <td>R value (%)</td>
                <td>F1 score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Total layers</td>
                <td>85.21</td>
                <td>79.23</td>
                <td>82.11</td>
              </tr>
              <tr valign="top">
                <td>The last six layers</td>
                <td>85.15</td>
                <td>78.65</td>
                <td>81.77</td>
              </tr>
              <tr valign="top">
                <td>The last four layers</td>
                <td>85.50</td>
                <td>78.68</td>
                <td>81.95</td>
              </tr>
              <tr valign="top">
                <td>The last two layers</td>
                <td>84.51</td>
                <td>78.68</td>
                <td>81.49</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>The Effect of Dataset Size</title>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> shows the impact of the dataset size on model performance. Horizontal coordinates represent the size of the training dataset and vertical coordinates indicate the F1 scores. During the experiment, we used different sized corpora to train the CNN-LSTM-CRF, BERT, and multi-level ER models. The figure shows that as the training dataset increased, the performance of the models also improved. In reality, we had a limited number of datasets, and models were unlikely to reach saturation. Therefore, the impact of dataset size on performance was particularly critical. We found that the CNN-LSTM-CRF model performance was sharply affected by the size of the dataset when the training set increased from 70% to 100%. Inversely, the BERT model and the multi-level ER model were less influenced by the training dataset size, and our proposed multi-level ER model outperformed the BERT model.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The effect of dataset size. BERT: Bidirectional Encoder Representations from Transformers; CNN: convolutional neural network; CRF: conditional random field; LSTM: long short-term memory; Multi-Level ER: multi-level representation learning for entity recognition.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17637_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Case Studies</title>
        <p>To show that our model was able to solve the challenge of integrating representation information, three case studies comparing the multi-level ER model with the BERT model are shown in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Several obvious trends emerged from the comparative experiments. Most generally, when the word “disease” is included within the medical history, it is mistaken for a disease. For example, case study 1 in <xref rid="figure5" ref-type="fig">Figure 5</xref> shows that “history of mental disease” is recognized as a disease. Case study 2 in <xref rid="figure5" ref-type="fig">Figure 5</xref> shows that when “anal” and “external genitals” appear together before the examination, the system will only identify the adjacent area to be tested. The descriptions with the obvious word “treatment” are identified as a treatment in case study 3 of <xref rid="figure5" ref-type="fig">Figure 5</xref>.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Case studies comparing the multi-level representation learning for entity recognition (Multi-Level ER) model with the Bidirectional Encoder Representations from Transformers (BERT) model.</p>
          </caption>
          <graphic xlink:href="medinform_v8i5e17637_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We found that the BERT model’s embedding technology improves the performance of the ER model in Chinese EMRs; however, using information from only the last layer of the feature extractor in the language model did not achieve the best experimental results. Our proposed multi-level ER model combines the information from each layer of the feature extractor and selects the most suitable, long-term, syntactic, relationship information for the ER task, which greatly improves the performance of the model.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>ER tasks attract a large amount of scholastic attention. The development of deep learning methods has resulted in a breakthrough regarding these tasks. CNN and recurrent neural network (RNN) models have emerged one after another; the attention mechanism and transfer learning were applied to the model. Wu et al [<xref ref-type="bibr" rid="ref19">19</xref>] utilized a CNN model to generate features represented by several global hidden nodes. Both local features and global features were then fed into a standard affine network to recognize named entities in clinical text. Ju et al [<xref ref-type="bibr" rid="ref20">20</xref>] used an LSTM neural model to identify nested entities by dynamically stacking flat, named ER layers. Rei et al [<xref ref-type="bibr" rid="ref21">21</xref>] applied the attention mechanism to dynamically decide how much information to use from a character-level or word-level component in an end-to-end model. Lee et al [<xref ref-type="bibr" rid="ref22">22</xref>] applied transfer learning in named ER by training a model on source task and using the trained model on the target task for fine-tuning. Peng et al [<xref ref-type="bibr" rid="ref23">23</xref>] proposed a method where the prediction model was based on BiLSTM, which was taken as the source task of transfer learning. For the ER task in clinical notes, Bharadwaj et al’s [<xref ref-type="bibr" rid="ref24">24</xref>] work centered on effectively adapting these neural architectures toward low-resource settings using parameter transfer methods.</p>
        <p>Language models can capture the syntactic and semantic information of words from a large number of unlabeled texts, which alleviates the problem of an insufficiently annotated corpus in special domains. Peters et al [<xref ref-type="bibr" rid="ref12">12</xref>] used a language model to obtain a deep contextualized word pretraining representation called ELMo and improved the accuracy of six NLP tasks. Radford et al [<xref ref-type="bibr" rid="ref13">13</xref>] proposed the GPT for language understanding tasks. For text classification and sequence labeling tasks, the transfer ability is better. Devlin et al [<xref ref-type="bibr" rid="ref14">14</xref>] proposed the pretraining of deep bidirectional transformers for language understanding (ie, BERT); it captured true directional context information, sweeping 11 NLP tasks through pretraining and fine-tuning.</p>
        <p>Our motivation is to seize the optimal information from each layer of a feature extractor to suit a given task. Takase et al [<xref ref-type="bibr" rid="ref25">25</xref>] employed intermediate layer representation, including input embedding, to calculate the probability distributions to solve a ranking problem in language generation tasks. Kaneko et al [<xref ref-type="bibr" rid="ref26">26</xref>] demonstrated that learning suitable representation came from different layers in grammatical error detection tasks. Therefore, we tracked their work and found the issue in the ER task in Chinese EMRs.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We propose a novel, multi-level, representation learning model for ER of Chinese EMRs—the multi-level ER model. We compared our model with state-of-the-art models and observed comparable performance without any external syntactic tools. The results showed that the use of the multi-head attention mechanism can effectively integrate deep semantic information from each layer of the feature extractor. In the future, we plan to apply multi-level ER to other language representation models in order to obtain even greater improvement.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">Abn</term>
          <def>
            <p>abnormal inspection result</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BIO</term>
          <def>
            <p>beginning of the noun phrase, middle of the noun phrase, and not a noun phrase</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CCKS</term>
          <def>
            <p>China Conference on Knowledge Graph and Semantic Computing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CEMR</term>
          <def>
            <p>Chinese electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">CRF</term>
          <def>
            <p>conditional random field</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">Dis</term>
          <def>
            <p>disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ELMo</term>
          <def>
            <p>Embeddings from Language Models</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">ER</term>
          <def>
            <p>entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">F1</term>
          <def>
            <p>F1 score</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">GloVe</term>
          <def>
            <p>Global Vectors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">GPT</term>
          <def>
            <p>Generative Pretraining Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">K</term>
          <def>
            <p>key</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">Med</term>
          <def>
            <p>medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">multi-level ER</term>
          <def>
            <p>multi-level representation learning for entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">P</term>
          <def>
            <p>precision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">Q</term>
          <def>
            <p>query</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb23">R</term>
          <def>
            <p>recall</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb24">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb25">SM</term>
          <def>
            <p>self-matching attention mechanism</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb26">Sym</term>
          <def>
            <p>symptom</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb27">Tre</term>
          <def>
            <p>treatment</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb28">V</term>
          <def>
            <p>value</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work is supported by the National Natural Science Foundation of China (No. 61762081, No. 61662067, and No. 61662068) and the Key Research and Development Project of Gansu Province (No. 17YF1GA016). The datasets used and analyzed during this study are available from the first author upon reasonable request. The CCKS 2018 dataset that supports the findings of this study were adopted from the Chinese EMR named ER task from the CCKS 2018, but restrictions apply to the availability of these data, which were used under license for this study and are not publicly available.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Si</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Improve neural entity recognition via multi-task data selection and constrained decoding</article-title>
          <source>Proceedings of the 2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</source>
          <year>2018</year>
          <conf-name>2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>346</fpage>
          <lpage>351</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N18-2056.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/N18-2056</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sanger</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Combining contextualized embeddings and prior knowledge for clinical named entity recognition: Evaluation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>11</month>
          <day>13</day>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>e14850</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/4/e14850/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14850</pub-id>
          <pub-id pub-id-type="medline">31719024</pub-id>
          <pub-id pub-id-type="pii">v7i4e14850</pub-id>
          <pub-id pub-id-type="pmcid">PMC6913757</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lample</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ballesteros</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Subramanian</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Neural architectures for named entity recognition</article-title>
          <source>Proceedings of the 2016 North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2016)</source>
          <year>2016</year>
          <conf-name>2016 North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2016)</conf-name>
          <conf-date>June 12-17, 2016</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <fpage>260</fpage>
          <lpage>270</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N16-1030.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/N16-1030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hovy</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>End-to-end sequence labeling via bi-directional LSTM-CNNs-CRF</article-title>
          <source>Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2016</year>
          <conf-name>54th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>August 7-12, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>1064</fpage>
          <lpage>1074</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/P16-1101.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/P16-1101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>William</surname>
              <given-names>WC</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning for sequence tagging with hierarchical recurrent networks</article-title>
          <source>Proceedings of the 5th International Conference on Learning Representations (ICLR 2017)</source>
          <year>2017</year>
          <conf-name>5th International Conference on Learning Representations (ICLR 2017)</conf-name>
          <conf-date>April 24-26, 2017</conf-date>
          <conf-loc>Toulon, France</conf-loc>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1703.06345.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>End-to-end neural coreference resolution</article-title>
          <source>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2017</year>
          <conf-name>2017 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>September 7-11, 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <fpage>188</fpage>
          <lpage>197</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D17-1018.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D17-1018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Adversarial multi-criteria learning for Chinese word segmentation</article-title>
          <source>Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2017</year>
          <conf-name>55th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 30-August 4, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>1193</fpage>
          <lpage>1203</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/P17-1110.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/P17-1110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Boukkouri</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ferret</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lavergne</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Embedding strategies for specialized domains: Application to clinical entity recognition</article-title>
          <source>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</source>
          <year>2019</year>
          <conf-name>57th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</conf-name>
          <conf-date>July 28-August 2, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>295</fpage>
          <lpage>301</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/P19-2041.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/P19-2041</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global Vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <conf-name>2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 25-29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>1543</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D14-1162.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Efficient estimation of word representations in vector space</article-title>
          <source>Proceedings of the International Conference on Learning Representations 2013</source>
          <year>2013</year>
          <conf-name>International Conference on Learning Representations 2013</conf-name>
          <conf-date>May 2-4, 2013</conf-date>
          <conf-loc>Scottsdale, Arizona</conf-loc>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1301.3781.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching word vectors with subword information</article-title>
          <source>Trans Assoc Comput Linguist</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/Q17-1010.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yih</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Dissecting contextual word embeddings: Architecture and representation</article-title>
          <source>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2018</year>
          <conf-name>2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 31-November 4, 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <fpage>1499</fpage>
          <lpage>1509</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D18-1179.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D18-1179</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salimans</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <source>OpenAI preprint</source>
          <year>2018</year>
          <access-date>2020-04-12</access-date>
          <comment>Improving language understanding by generative pre-training<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2019-10-18</access-date>
          <comment>BERT: Pre-training of deep bidirectional transformers for language understanding<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1810.04805.pdf">https://arxiv.org/pdf/1810.04805.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <source>Proceedings of the 2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</source>
          <year>2018</year>
          <conf-name>2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>2227</fpage>
          <lpage>2237</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N18-1202.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>Ö</given-names>
            </name>
          </person-group>
          <article-title>Annotating risk factors for heart disease in clinical narratives for diabetic patients</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58 Suppl</volume>
          <fpage>S78</fpage>
          <lpage>S91</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00089-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.05.009</pub-id>
          <pub-id pub-id-type="medline">26004790</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00089-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC4978180</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2015</year>
          <access-date>2019-10-18</access-date>
          <comment>Bidirectional LSTM-CRF models for sequence tagging<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1508.01991v1.pdf">https://arxiv.org/pdf/1508.01991v1.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A deep learning model incorporating part of speech and self-matching attention for named entity recognition of Chinese electronic medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>04</month>
          <day>09</day>
          <volume>19</volume>
          <issue>Suppl 2</issue>
          <fpage>65</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-019-0762-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0762-7</pub-id>
          <pub-id pub-id-type="medline">30961622</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-019-0762-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6454585</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Named entity recognition in Chinese clinical text using deep neural network</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2015</year>
          <volume>216</volume>
          <fpage>624</fpage>
          <lpage>628</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26262126"/>
          </comment>
          <pub-id pub-id-type="medline">26262126</pub-id>
          <pub-id pub-id-type="pmcid">PMC4624324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ju</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Miwa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A neural layered model for nested named entity recognition</article-title>
          <source>Proceedings of the 2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</source>
          <year>2018</year>
          <conf-name>2018 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>1446</fpage>
          <lpage>1459</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N18-1131.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/N18-1131</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Crichton</surname>
              <given-names>GKO</given-names>
            </name>
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Attending to characters in neural sequence labeling models</article-title>
          <source>Proceedings of the 26th International Conference on Computational Linguistics (COLING 2016): Technical Papers</source>
          <year>2016</year>
          <conf-name>26th International Conference on Computational Linguistics (COLING 2016): Technical Papers</conf-name>
          <conf-date>December 11-17, 2016</conf-date>
          <conf-loc>Osaka, Japan</conf-loc>
          <fpage>309</fpage>
          <lpage>318</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/C16-1030.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Dernoncourt</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2017</year>
          <access-date>2019-10-18</access-date>
          <comment>Transfer learning for named-entity recognition with neural networks<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1705.06273.pdf">https://arxiv.org/pdf/1705.06273.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>TL-NER: A transfer learning model for Chinese named entity recognition</article-title>
          <source>Inf Syst Front</source>
          <year>2019</year>
          <month>6</month>
          <day>4</day>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.1007/s10796-019-09932-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bharadwaj</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mortensen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Carbonell</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Phonologically aware neural model for named entity recognition in low resource transfer settings</article-title>
          <source>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>2016 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 1-5, 2016</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <fpage>1462</fpage>
          <lpage>1472</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D16-1153.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D16-1153</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takase</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Suzuki</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nagata</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Direct output connection for a high-rank language model</article-title>
          <source>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2018</year>
          <conf-name>2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 31-November 4, 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <fpage>4599</fpage>
          <lpage>4609</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D18-1489.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D18-1489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaneko</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Komachi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2019</year>
          <access-date>2019-10-18</access-date>
          <comment>Multi-head multi-layer attention to deep language representations for grammatical error detection<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1904.07334.pdf">https://arxiv.org/pdf/1904.07334.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
