<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i4e17787</article-id>
      <article-id pub-id-type="pmid">32347806</article-id>
      <article-id pub-id-type="doi">10.2196/17787</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Modified Bidirectional Encoder Representations From Transformers Extractive Summarization Model for Hospital Information Systems Based on Character-Level Tokens (AlphaBERT): Development and Performance Evaluation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mayer</surname>
            <given-names>Gwendolyn</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Muto</surname>
            <given-names>Tomoyasu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ma</surname>
            <given-names>Shuoxin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Rau</surname>
            <given-names>Hsiao-Hsien</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Yen-Pin</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Graduate Institute of Biomedical Electronics and Bioinformatics</institution>
            <institution>National Taiwan University</institution>
            <addr-line>Room 410, Barry Lam Hall</addr-line>
            <addr-line>No 1, Sec 4, Roosevelt Road</addr-line>
            <addr-line>Taipei City</addr-line>
            <country>Taiwan</country>
            <phone>886 2 3366 3754</phone>
            <email>f06945029@g.ntu.edu.tw</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2473-0847</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Yi-Ying</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8947-2730</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Jr-Jiun</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2650-5579</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Chien-Hua</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2981-4537</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>Feipei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7147-8122</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Graduate Institute of Biomedical Electronics and Bioinformatics</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>National Taiwan University Hospital Chu-Tung Branch</institution>
        <addr-line>Hsinchu County</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>National Taiwan University Hospital</institution>
        <addr-line>Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>College of Medicine</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Computer Science &#38; Information Engineering</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Electrical Engineering</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yen-Pin Chen <email>f06945029@g.ntu.edu.tw</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>4</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>29</day>
        <month>4</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>4</issue>
      <elocation-id>e17787</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>1</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>6</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>4</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Yen-Pin Chen, Yi-Ying Chen, Jr-Jiun Lin, Chien-Hua Huang, Feipei Lai. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 29.04.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/4/e17787/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Doctors must care for many patients simultaneously, and it is time-consuming to find and examine all patients’ medical histories. Discharge diagnoses provide hospital staff with sufficient information to enable handling multiple patients; however, the excessive amount of words in the diagnostic sentences poses problems. Deep learning may be an effective solution to overcome this problem, but the use of such a heavy model may also add another obstacle to systems with limited computing resources.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to build a diagnoses-extractive summarization model for hospital information systems and provide a service that can be operated even with limited computing resources.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used a Bidirectional Encoder Representations from Transformers (BERT)-based structure with a two-stage training method based on 258,050 discharge diagnoses obtained from the National Taiwan University Hospital Integrated Medical Database, and the highlighted extractive summaries written by experienced doctors were labeled. The model size was reduced using a character-level token, the number of parameters was decreased from 108,523,714 to 963,496, and the model was pretrained using random mask characters in the discharge diagnoses and International Statistical Classification of Diseases and Related Health Problems sets. We then fine-tuned the model using summary labels and cleaned up the prediction results by averaging all probabilities for entire words to prevent character level–induced fragment words. Model performance was evaluated against existing models BERT, BioBERT, and Long Short-Term Memory (LSTM) using the Recall-Oriented Understudy for Gisting Evaluation (ROUGE) L score, and a questionnaire website was built to collect feedback from more doctors for each summary proposal.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The area under the receiver operating characteristic curve values of the summary proposals were 0.928, 0.941, 0.899, and 0.947 for BERT, BioBERT, LSTM, and the proposed model (AlphaBERT), respectively. The ROUGE-L scores were 0.697, 0.711, 0.648, and 0.693 for BERT, BioBERT, LSTM, and AlphaBERT, respectively. The mean (SD) critique scores from doctors were 2.232 (0.832), 2.134 (0.877), 2.207 (0.844), 1.927 (0.910), and 2.126 (0.874) for reference-by-doctor labels, BERT, BioBERT, LSTM, and AlphaBERT, respectively. Based on the paired t test, there was a statistically significant difference in LSTM compared to the reference (<italic>P</italic>&#60;.001), BERT (<italic>P</italic>=.001), BioBERT (<italic>P</italic>&#60;.001), and AlphaBERT (<italic>P</italic>=.002), but not in the other models.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Use of character-level tokens in a BERT model can greatly decrease the model size without significantly reducing performance for diagnoses summarization. A well-developed deep-learning model will enhance doctors’ abilities to manage patients and promote medical studies by providing the capability to use extensive unstructured free-text notes.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>transformer</kwd>
        <kwd>BERT</kwd>
        <kwd>deep learning</kwd>
        <kwd>emergency medicine</kwd>
        <kwd>automatic summarization</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Medical centers are the last line of defense for public health and are responsible for educating medical talent. The number of patients in the emergency department of such medical centers is particularly large, and these patients tend to have more severe conditions than those admitted to hospital at a lower tier. For staff, the emergency department can be an overloaded work environment [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. At the beginning of the shift, a doctor must perform primary care for more than 30 patients who remain in the emergency department from less than 1 hour to more than 3 days, while simultaneously treating new arrivals from triage. The conditions of patients in the emergency department also tend to change rapidly, and the staff must be able to handle these patients under time constraints. The International Statistical Classification of Diseases and Related Health Problems (ICD) codes [<xref ref-type="bibr" rid="ref3">3</xref>] and recent discharge diagnoses can help staff rapidly determine baseline conditions. However, in a medical center, patients may have multiple underlying diseases and several comorbidities that were previously recorded as ICD codes and discharge diagnoses in electronic health records (EHRs). Because ICD codes only reflect the disease and not the associated treatments, this lack of information limits the ability of medical staff to consider information related to a previous hospital visit. Occasionally, ICD codes are selected imprecisely and do not adequately represent the condition of the patient. Therefore, discharge diagnoses are required for staff to become familiar with a patient’s condition. However, the number of words describing these details in a diagnostic sentence can vary widely. Consequently, the attending physician in the emergency department may have to read as many as 1500 words to cover the medical history of all patients under their charge. 
To resolve this challenge, the purpose of this study was to establish a diagnostic summary system to help hospital staff members check information on all patients more quickly.</p>
      </sec>
      <sec>
        <title>Related Works</title>
        <p>There are several available methods to accomplish a text summarization task, ranging from traditional natural language processing (NLP) to deep-learning language models [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. The goals of previous text summarization studies in the medical field [<xref ref-type="bibr" rid="ref5">5</xref>] included finding information related to patient care in the medical literature [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], identifying drug information [<xref ref-type="bibr" rid="ref14">14</xref>], determining medical article topic classifications [<xref ref-type="bibr" rid="ref15">15</xref>], and summarizing medical articles [<xref ref-type="bibr" rid="ref16">16</xref>]. In the majority of cases, data sources for the automatic summarization task were medical articles [<xref ref-type="bibr" rid="ref16">16</xref>] such as PubMed articles [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. In recent years, EHRs have been widely adopted in several hospitals and clinics, and additional data sources such as the Medical Information Mart for Intensive Care III [<xref ref-type="bibr" rid="ref17">17</xref>] dataset are available online for free and promote medical progress. Based on medical record research, the monitoring of several disease indicators, clinical trial recruitments, and clinical decision making, several clinical summarization systems based on EHRs have been studied [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. However, no studies have addressed the issue of a diagnostic summary system to help hospital staff access information on all patients in their care more quickly.</p>
        <p>Although EHRs provide useful information, the majority of this information is recorded as free text, making it challenging to analyze along with other structured data [<xref ref-type="bibr" rid="ref4">4</xref>]. In recent years, NLP and deep-learning approaches have flourished, furnishing health care providers with a new field to promote human health. Several excellent language models are now available to help machines analyze free text. One such model is Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref21">21</xref>], which is an extension of Transformer [<xref ref-type="bibr" rid="ref22">22</xref>], and received the highest score for several NLP tasks [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>Transformer is a state-of-the-art model, which was released to translate and improve the efficiency of Long Short-Term Memory (LSTM) [<xref ref-type="bibr" rid="ref25">25</xref>]-based language models [<xref ref-type="bibr" rid="ref22">22</xref>]. Similar to many deep-network models, Transformer has an encoder and a decoder. The encoder converts the input data into meaningful codes (vector or matrix), while reducing the dimension size (a major bottleneck for data analysis), and the decoder converts the code to output [<xref ref-type="bibr" rid="ref26">26</xref>]. Taking translation as an example, the encoder converts an English sentence into a digital vector in latent space, and the decoder then converts the digital vector into a corresponding sentence in the desired language. The encoder of Transformer has an embedding model, a repeating block model with a multihead self-attention model, and a feedforward model with an architecture based on the shortcut connections concept [<xref ref-type="bibr" rid="ref27">27</xref>] and layer normalization [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p>
        <p>The automatic text summarization task has two branches: extractive and abstractive [<xref ref-type="bibr" rid="ref29">29</xref>]. The extractive branch identifies keywords or sentences as summaries without changing the original document, while the abstractive branch adapts a new short sentence. The diagnosis summarizes the entire admission course, including the chief complaints and treatment course, in highly concentrated and meaningful sentences that help other staff members to quickly manage patients. Because patients in the emergency department have many underlying diseases, along with the high complexity of the conditions of individual patients, incomplete sentences, grammatical issues, and some subordinate prompts, the diagnosis obtained may not be concise. Consequently, the staff needs to include an abundance of words in their diagnoses to best represent the condition of the patient. These rich vocabularies involve not only specific disease terms but also important treatments that are delivered in the course of admission and are associated with verbose text related to diagnoses. Therefore, it is necessary to further summarize the diagnoses using an extractive summarization approach.</p>
        <p>The extractive summarization model can be simplified to a regression problem that outputs the probability of choosing or not choosing. Taking a single character as the token unit, this problem is similar to the segmentation problem in computer vision [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], which outputs the class probability by pixels. A BERT-based model is the superior choice in this context since the attention weight is similar to the extraction probability [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] and Transformer was reported to exhibit higher performance with the language model than convolutional neural networks, recurrent neural networks, or the LSTM model [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>BERT is a state-of-the-art language model for many NLP tasks that is pretrained with unsupervised learning, including “masked language modeling” and “next-sentence prediction.” BERT is pretrained through several corpus datasets, which are then transferred to learning through supervised data [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>] to defeat other language models in several competitions [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. The pretrained model is available [<xref ref-type="bibr" rid="ref37">37</xref>] and can be fine-tuned for many scenarios.</p>
        <p>Because English is not the native language in Taiwan, there are various typos and spelling errors in free-text medical records. Use of the word-level method [<xref ref-type="bibr" rid="ref38">38</xref>], which is based on Word2vec [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], can result in this out-of-vocabulary obstacle. In addition, the internal structure of the word is also important and improves vector representation [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. This obstacle can be overcome by adopting the character-level method [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>], which uses a single character or letter as the analysis unit, or the byte-pair encoding (BPE) model, which breaks down each word into multiple subword units (ie, “word pieces”) [<xref ref-type="bibr" rid="ref45">45</xref>]. These methods can decrease the total vocabulary and can also handle rare words, typos, and spelling errors. The word-level and BPE methods were adopted in BERT, resulting in a comprehensive and adaptable model for many types of NLP tasks.</p>
        <p>In EHRs, medical terms, abbreviations, dates, and some count numbers for treatment are rarely found in the general corpus dataset, and will result in poor performance of the model. BioBERT, which is based on the BERT model and uses the same tokenizer, is obtained through advanced training on a biomedical corpus [<xref ref-type="bibr" rid="ref46">46</xref>], and was considered to be well-suited to address our study aims. However, the general computing environments of some medical centers have limited capability to train or fine-tune a heavy model (involving approximately 1 billion parameters) in BERT. Therefore, replacing token units with a character-level method can further reduce the vocabulary and model size, enabling the use of the internal structures of words to avoid the out-of-vocabulary problem.</p>
      </sec>
      <sec>
        <title>Objective</title>
        <p>Our goal was to build a diagnoses-extractive summarization model that can run on the limited computing resources of hospital information systems with good performance. Therefore, we present AlphaBERT, a BERT-based model using the English alphabet (character-level) as the token unit. We compared the performance of AlphaBERT and the number of parameters with those of the other existing models described above.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Materials</title>
        <p>A dataset of 258,050 discharge diagnoses was obtained from the National Taiwan University Hospital Integrated Medical Database (NTUH-iMD). The discharge diagnoses originated from the following departments (in descending order): surgery, internal medicine, obstetrics and gynecology, pediatrics, oncology, orthopedic surgery, urology, otolaryngology, ophthalmology, traumatology, dentistry, neurology, family medicine, psychiatry, physical medicine and rehabilitation, dermatology, emergency medicine, geriatrics, and gerontology. This study was approved by Research Ethics Committee B, National Taiwan University Hospital (201710066RINB).</p>
        <p>In the pretraining stage, 71,704 diagnoses collected by the ICD 10th Revision (ICD-10) [<xref ref-type="bibr" rid="ref3">3</xref>] were also used, and the 258,050 discharge diagnoses were split into 245,148 (95.00%) as the pretrained training dataset and 12,902 (5.00%) as the pretrained validation dataset. In the fine-tuning stage, the extractive summary for supervised learning was labeled by three experienced doctors who have worked in the emergency department for more than 8 years. The fine-tuned dataset included 2530 training labels from the pretrained training dataset, and 250 validation labels and 589 testing labels from the pretrained validation dataset (<xref rid="figure1" ref-type="fig">Figure 1</xref>). We fed the model using 589 data entries in the fine-tuning testing set and obtained a predicted proposal for performance evaluation.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Pretrained validation dataset. ICD: International Statistical Classification of Diseases and Related Health Problems.</p>
          </caption>
          <graphic xlink:href="medinform_v8i4e17787_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Implementation Details</title>
        <p>The hardware used for implementation was an I7 5960x CPU, with 60 G RAM, and 2 Nvidia GTX 1080 Ti GPUs. The software used were Ubuntu 18.04 [<xref ref-type="bibr" rid="ref47">47</xref>], Anaconda 2019.03 [<xref ref-type="bibr" rid="ref48">48</xref>], and PyTorch 1.2.0 [<xref ref-type="bibr" rid="ref49">49</xref>].</p>
      </sec>
      <sec>
        <title>Label Data</title>
        <p>We created a diagnosis-label tool to print the discharge diagnosis from the dataset in a textbox. Doctors highlighted the discharge diagnoses by selecting words that were considered to be most relevant, and the tool identified the highlighted position characters, which were labeled 1 and the others were labeled 0. For example, “1.Bladder cancer with” was labeled “001111111111111110000” and stored in the label dataset. We encouraged doctors to skip short diagnoses, because the summarization service will be more useful for longer diagnoses. Therefore, only longer diagnoses were labeled and collected in the fine-tuning set. </p>
      </sec>
      <sec>
        <title>Data Augmentation</title>
        <p>In this study, the pretraining dataset was smaller than the dataset used in the pretrained model of BERT and its extensions [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Because the diagnoses included several independent diagnoses such as hypertension, cellulitis, and colon cancer, we augmented the pretraining dataset by stitching many diagnoses derived from ICD codes or NTUH-iMD. Accordingly, data augmentation was performed by selecting between 1 and 29 random diagnostic data entries from the dataset and combining them into longer and more complex diagnoses as the pretrained dataset. We set all diagnoses to a maximum of 1350 characters because of GPU memory limitations.</p>
        <p>Because there was also a significant shortage of fine-tuning data, the same data augmentation strategy was used to extend the fine-tuning dataset. To provide greater tolerance for typos, we also randomly replaced 0.1% of the characters in the diagnoses during the fine-tuning stage.</p>
      </sec>
      <sec>
        <title>Preprocess and Tokenization</title>
        <p>We retained only 100 symbols, including letters, numbers, and some punctuation. All free-text diagnoses were preprocessed by filters, and symbols outside of the reserved list were replaced with spaces. Original letter cases (uppercase and lowercase) were retained for analysis.</p>
        <p>The preprocessing of diagnoses then converted the symbols (letters, numbers, and punctuation) into numbers with a one-to-one correspondence. For example, “1.Bladder cancer with” was converted to the array “14, 11, 31, 68, 57, 60, 60, 61, 74, 0, 59, 57, 70, 59, 61, 74, 0, 79, 65, 76, 64.”</p>
      </sec>
      <sec>
        <title>Model Architecture</title>
        <p>The architecture of AlphaBERT is based on that of BERT, and our model is based on the PyTorch adaptation released by the HuggingFace team [<xref ref-type="bibr" rid="ref37">37</xref>]. In this study, we used a 16-layer Transformer encoder with 16 self-attention heads and a hidden size of 64. Character-level tokenizers were used as the token generator of AlphaBERT. There are 963,496 parameters in the whole model, and the symbols are represented by tokenization as one-hot encoding, corresponding to each vector with a hidden size of 64 as the token embeddings. The position embeddings (hidden size 64) are trainable vectors that correspond to the position of the symbol [<xref ref-type="bibr" rid="ref21">21</xref>], in which the maximum length of position embeddings is set to 1350. The summation of the token embeddings and position embeddings is then used as the input embeddings (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) as input to AlphaBERT (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Deep-learning model architecture.</p>
          </caption>
          <graphic xlink:href="medinform_v8i4e17787_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Pretraining Stage</title>
        <p>The two-stage learning approach of BERT [<xref ref-type="bibr" rid="ref21">21</xref>] is based on an unsupervised feature-based method, which then transfers the learning to supervised data. The unsupervised pretraining stage of BERT uses a masked language model procedure called a “cloze procedure” [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Since AlphaBERT was used as the character-level token model, and we used “^” as the “[MASK]” in BERT, we randomly selected 15% of the character sequence, 80% of which was replaced by “^,” 10% was replaced with letters, and the remaining 10% was left unchanged. After the loss converged, we then masked the entire word to further pretrain our model.</p>
        <p>Because the free-text diagnoses contained dates, chemotherapy cycles, cancer staging index, and punctuation marks, these words were nonprompted, nongeneric, and changed sequentially. Even experienced doctors cannot recover hidden dates or cycles without prompts, and therefore the letters were replaced with other letters, numbers were replaced with other numbers, and punctuation marks were replaced with other punctuation marks (but were still randomly selected to mask by “^”).</p>
        <p>In the masked language model used in this study, the BERT model was connected to a fully connected network decoder <bold>A</bold>, which then transformed the 64-dimensional hidden size to a 100-dimensional symbol list size corresponding to the probability <italic>p</italic> of each symbol. The loss function <italic>Loss<sup>mask</sup></italic> is the cross-entropy among the probabilities of each symbol (left side of <xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <p>
          <graphic xlink:href="medinform_v8i4e17787_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>E<sup>mask</sup></italic> denotes the input embedding converted from masking characters, <italic>BERT ()</italic> is the BERT model, <italic>A</italic> () is the fully connected linear decoder to each preserved character, <italic>p</italic> is the probability function, and <italic>1<sub>i</sub><sup>mask</sup></italic> denotes the <italic>i<sub>th</sub></italic> character masked.</p>
      </sec>
      <sec>
        <title>Fine-Tuning Stage</title>
        <p>Another fully connected network, <bold><italic>S</italic></bold><italic>,</italic> decoded the results of the multi-layer Transformer encoder to the predicted probability <italic>p</italic>. The output size of the decoder <bold><italic>S</italic></bold> is two-dimensional, which indicated the possibility of selection. The loss function <bold><italic>Loss</italic></bold> is the cross-entropy among <italic>p</italic> and the ground truth (right side of <xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <p>
          <graphic xlink:href="medinform_v8i4e17787_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>S</italic> () is the fully connected linear decoder for selection.</p>
      </sec>
      <sec>
        <title>Cleanup Method</title>
        <p>When we evaluated our model, the probability of each word was represented by the mean probability of each character in the word. In this method, we split the characters list <italic>C =</italic> [<italic>c<sub>1</sub>, c<sub>2</sub>,...c<sub>n</sub></italic>] into a list of several word sets <italic>W</italic> = [<italic>w<sub>1</sub>, w<sub>2</sub>, ..., w<sub>k</sub></italic>], <italic>k</italic> ≤ <italic>n</italic>, where the cleanup probability <italic>p̂<sub>i</sub></italic> of each <italic>c<sub>i</sub></italic> will be the average of all probabilities in <italic>w<sub>m</sub></italic> that contain <italic>c<sub>i</sub></italic>.</p>
        <p>
          <graphic xlink:href="medinform_v8i4e17787_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>p</italic> denotes the probability after cleanup, <italic>w<sub>m</sub></italic> denotes the sequences of characters belonging to the <italic>m<sub>th</sub></italic> word, and <italic>n</italic>() is the length of the unit in the set.</p>
      </sec>
      <sec>
        <title>BERT Models for Extractive Summarization</title>
        <p>We also compared the state-of-the-art models and adjusted them to fit the target task. The purpose of these models was not summarization, and there is no well-presented, fine-tuned model for this purpose available. Based on the word pieces BPE method [<xref ref-type="bibr" rid="ref45">45</xref>], all words were split into several element tokens and then the predicted result was associated with the word pieces. Accordingly, for this task, we filtered out the punctuation marks and added “[CLS]” in the head of every word (<italic>E<sup>head</sup></italic>) to represent the entire word, which prevented fragmented results.</p>
        <p>
          <graphic xlink:href="medinform_v8i4e17787_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>E<sup>head</sup></italic> denotes the input embedding converted from a word (with head) and <italic>1<sub>i</sub><sup>head</sup></italic> denotes that the <italic>i<sub>th</sub></italic> character is a head token.</p>
      </sec>
      <sec>
        <title>LSTM Model for Extractive Summarization</title>
        <p>We also used the LSTM model [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] for this summarization task. To achieve effective comparison with our model, we pretrained the input embedding using Word2vec [<xref ref-type="bibr" rid="ref39">39</xref>] and adopted a 9-layer bidirectional LSTM with 899,841 parameters, which was very similar to our model.</p>
        <p>
          <graphic xlink:href="medinform_v8i4e17787_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
      </sec>
      <sec>
        <title>Hyperparameters</title>
        <p>We used Adam optimization [<xref ref-type="bibr" rid="ref51">51</xref>] with a learning rate of 1×10<sup>–5</sup> in the warmup phase [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>], and then switched to a rate of 1×10<sup>–4</sup> and a minibatch size of 2. The hyperparameter used in this study was the threshold to the character-level probability of selection, which was chosen using a receiver operating characteristic (ROC) curve and <italic>F1</italic> statistic counting from the fine-tuning validation set (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
      </sec>
      <sec>
        <title>Measurement</title>
        <p>We measured the performance of the various models using the ROC curve, an <italic>F1</italic> statistic, and the <italic>F1</italic> statistic of Recall-Oriented Understudy for Gisting Evaluation (ROUGE) [<xref ref-type="bibr" rid="ref54">54</xref>]. To maintain measurement consistency, we filtered out all punctuation in the predicted proposals, counted the results at the word level, and collected physicians’ feedback for each model. A questionnaire website was established in which the original diagnoses were randomly selected and displayed in the first part, and the ground truth summary proposal determined by testing labels and proposals predicted by models were displayed in the second part under random sorting. We recruited 14 experienced physicians for this purpose, including the chief resident, 10 attending physicians of the emergency department at the medical center, one emergency department attending physician at the regional hospital, and two emergency attending physicians at the district hospital. They entered a score of 0-3 for each proposal, in which 0 represented “nonsensical” and 3 represented “good.”</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Data were analyzed using the statistical package RStudio (version 1.2.5019) based on R (version 3.6.1; R Foundation for Statistical Computing, Vienna, Austria). For group comparisons, we performed the pairwise paired <italic>t</italic> test on the dependent variables of the physician scores and set the significance threshold level to <italic>P</italic>&#60;.05. </p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The discharge diagnoses dataset included 57,960 lowercase English words. The maximum number of words in a diagnosis was 654 (3654 characters), with a mean of 55 (SD 51) words corresponding to 355 (SD 318) characters. In the fine-tuning dataset, the mean number of words in the diagnoses and summary were 78 (SD 56) and 12 (SD 7), respectively. The retention ratio [<xref ref-type="bibr" rid="ref55">55</xref>] (ie, words in the summary divided by words in the diagnoses) was 12 out of 78 words (15%). The fine-tuning testing set included 138 diagnoses with incorrect words, and a total of 183 incorrect words were counted manually by two attending physicians, including 153 misspellings, 13 typos, 14 inappropriate words, and 3 repeated words. </p>
      <p>Our proposed model, AlphaBERT, demonstrated the highest performance among all compared models with an area under the ROC curve (AUROC) of 0.947, and the LSTM demonstrated the worst performance with an AUROC of 0.899 (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
      <fig id="figure3" position="float">
        <label>Figure 3</label>
        <caption>
          <p>Model receiver operating characteristic (ROC) curves.</p>
        </caption>
        <graphic xlink:href="medinform_v8i4e17787_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>BioBERT achieved the highest ROUGE scores (<xref ref-type="table" rid="table1">Table 1</xref>). BERT and the proposed model were in the intermediate range, with the lowest scores obtained with the LSTM. In addition, the ROUGE score was the highest for reference Doctor A and was the lowest for Doctor C (<xref ref-type="table" rid="table1">Table 1</xref>). When there were incorrect words in the input diagnoses, the performance of all models deteriorated (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
      <p>We collected 246 critical scores from the 14 doctors who responded to the questionnaire. Statistically significant differences (based on the paired <italic>t</italic> test) were detected between the LSTM and each of the reference, BERT, BioBERT, and our proposed model, but not among the other model comparisons (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
      <p>We built the service on a website [<xref ref-type="bibr" rid="ref56">56</xref>] using a server with only one CPU (no GPU) on the Microsoft Azure platform to provide a diagnoses-extractive summarization service. Editorial suggestions are also available on the website to gather user feedback and to continue to improve the model. The source code is available on GitHub [<xref ref-type="bibr" rid="ref57">57</xref>]. The service is currently being integrated into the hospital information system to enhance the capabilities of hospital staff.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Model parameters and ROUGE<sup>a</sup> F1 results.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="260"/>
          <col width="170"/>
          <col width="170"/>
          <col width="0"/>
          <col width="150"/>
          <col width="0"/>
          <col width="220"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Model</td>
              <td>Dr A (n=250)</td>
              <td>Dr B (n=248)</td>
              <td colspan="2">Dr C (n=91)</td>
              <td colspan="2">Mean <italic>F1</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="2">
                <bold>BERT<sup>b</sup> (108,523,714 parameters)</bold>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-1<sup>c</sup></td>
              <td>0.761</td>
              <td colspan="2">0.693</td>
              <td colspan="2">0.648</td>
              <td>0.715</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-2<sup>d</sup></td>
              <td>0.612</td>
              <td colspan="2">0.513</td>
              <td colspan="2">0.473</td>
              <td>0.549</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-L<sup>e</sup></td>
              <td>0.748</td>
              <td colspan="2">0.671</td>
              <td colspan="2">0.627</td>
              <td>0.697</td>
            </tr>
            <tr valign="top">
              <td colspan="2">
                <bold>BioBERT<sup>f</sup> (108,523,714 parameters)</bold>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-1</td>
              <td>0.788</td>
              <td colspan="2">0.697</td>
              <td colspan="2">0.647</td>
              <td>0.728</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-2</td>
              <td>0.642</td>
              <td colspan="2">0.523</td>
              <td colspan="2">0.464</td>
              <td>0.565</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-L</td>
              <td>0.773</td>
              <td colspan="2">0.678</td>
              <td colspan="2">0.629</td>
              <td>0.711</td>
            </tr>
            <tr valign="top">
              <td colspan="2">
                <bold>LSTM<sup>g</sup> (899,841 parameters)</bold>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-1</td>
              <td>0.701</td>
              <td colspan="2">0.647</td>
              <td colspan="2">0.618</td>
              <td>0.666</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-2</td>
              <td>0.531</td>
              <td colspan="2">0.468</td>
              <td colspan="2">0.459</td>
              <td>0.494</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-L</td>
              <td>0.684</td>
              <td colspan="2">0.629</td>
              <td colspan="2">0.602</td>
              <td>0.648</td>
            </tr>
            <tr valign="top">
              <td colspan="2">
                <bold>Proposed model (963,496 parameters)</bold>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-1</td>
              <td>0.769</td>
              <td colspan="2">0.678</td>
              <td colspan="2">0.647</td>
              <td>0.712</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-2</td>
              <td>0.610</td>
              <td colspan="2">0.482</td>
              <td colspan="2">0.463</td>
              <td>0.533</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ROUGE-L</td>
              <td>0.751</td>
              <td colspan="2">0.656</td>
              <td colspan="2">0.632</td>
              <td>0.693</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>ROUGE: Recall-Oriented Understudy for Gisting Evaluation.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>ROUGE-1: Recall-Oriented Understudy for Gisting Evaluation with unigram overlap.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>ROUGE-2: Recall-Oriented Understudy for Gisting Evaluation with bigram overlap.</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>ROUGE-L: Recall-Oriented Understudy for Gisting Evaluation for the longest common subsequence. n represents the number of reference labels.</p>
          </fn>
          <fn id="table1fn6">
            <p><sup>f</sup>BioBERT: Bidirectional Encoder Representations from Transformers trained on a biomedical corpus.</p>
          </fn>
          <fn id="table1fn7">
            <p><sup>g</sup>LSTM: Long Short-Term Memory.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>ROUGE<sup>a</sup> F1 results of diagnoses with incorrect words.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="400"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="bottom">
              <td>ROUGE-L<sup>b</sup></td>
              <td>BERT<sup>c</sup></td>
              <td>BioBERT<sup>d</sup></td>
              <td>LSTM<sup>e</sup></td>
              <td>Proposed Model</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Diagnoses without error words (n=451)<sup>f</sup></td>
              <td>0.704</td>
              <td>0.717</td>
              <td>0.651</td>
              <td>0.698</td>
            </tr>
            <tr valign="top">
              <td>Diagnoses with incorrect words (n=138)</td>
              <td>0.676</td>
              <td>0.692</td>
              <td>0.640</td>
              <td>0.674</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>ROUGE: Recall-Oriented Understudy for Gisting Evaluation.</p>
          </fn>
          <fn id="table2fn2">
            <p><sup>b</sup>ROUGE-L: ROUGE for the longest common subsequence.</p>
          </fn>
          <fn id="table2fn3">
            <p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
          </fn>
          <fn id="table2fn4">
            <p><sup>d</sup>BioBERT: Bidirectional Encoder Representations from Transformers trained on a biomedical corpus.</p>
          </fn>
          <fn id="table2fn5">
            <p><sup>e</sup>LSTM: Long Short-Term Memory.</p>
          </fn>
          <fn id="table2fn6">
            <p><sup>f</sup>n represents the number of reference labels.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Critique scores of models from doctors (N=246).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="140"/>
          <col width="180"/>
          <col width="170"/>
          <col width="170"/>
          <col width="170"/>
          <col width="170"/>
          <thead>
            <tr valign="top">
              <td rowspan="2">Model</td>
              <td rowspan="2">Score, mean (SD)</td>
              <td colspan="4"><italic>P</italic> value</td>
            </tr>
            <tr valign="bottom">
              <td>BERT<sup>a</sup></td>
              <td>BioBERT<sup>b</sup></td>
              <td>LSTM<sup>c</sup></td>
              <td>Proposed Model</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Reference</td>
              <td>2.232 (0.832)</td>
              <td>.11</td>
              <td>.66</td>
              <td>&#60;.001</td>
              <td>.10</td>
            </tr>
            <tr valign="top">
              <td>BERT</td>
              <td>2.134 (0.877)</td>
              <td>
                <break/>
              </td>
              <td>.10</td>
              <td>.001</td>
              <td>.89</td>
            </tr>
            <tr valign="top">
              <td>BioBERT</td>
              <td>2.207 (0.844)</td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>&#60;.001</td>
              <td>.19</td>
            </tr>
            <tr valign="top">
              <td>LSTM</td>
              <td>1.927 (0.910)</td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>.002</td>
            </tr>
            <tr valign="top">
              <td>Proposed</td>
              <td>2.126 (0.874)</td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
          </fn>
          <fn id="table3fn2">
            <p><sup>b</sup>BioBERT: Bidirectional Encoder Representations from Transformers trained on a biomedical corpus.</p>
          </fn>
          <fn id="table3fn3">
            <p><sup>c</sup>LSTM: Long Short-Term Memory.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>AlphaBERT effectively performed the extractive summarization task on medical clinic notes and decreased the model size compared to BERT, reducing the number of parameters from 108,523,714 to 963,496 using a character-level tokenizer. AlphaBERT showed similar performance to BERT and BioBERT in this extractive summarization task. In spite of the heavy model, both BERT and BioBERT were demonstrated to be excellent models and well-suited for several tasks (including the primary task of this study) with small adjustments. For convenience, the model can be used in a straightforward manner to rapidly build new apps in the medical field. Because of the well pretrained NLP feature extraction model, a small label dataset (the fine-tuning training set includes only 2530 cases) is sufficient for supervised learning and achieving the goal. </p>
        <p>In this study, we obtained high ROUGE <italic>F1</italic> scores for all models. In general summarization studies, the ROUGE <italic>F1</italic> score was typically less than 0.40 [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], whereas we achieved a score of 0.71, which corresponds with a higher retention ratio (15%) for this task than the corpus of other summarization tasks such as the CNN/Daily Mail Corpus (approximately 7%) [<xref ref-type="bibr" rid="ref7">7</xref>]. Since the diagnosis can be considered as a summary of admission records, a higher retention rate is reasonable; however, for emergencies, the diagnosis will contain too many redundant words in some cases. </p>
        <p>The ICD-10 is a well-classified system with more than 70,000 codes, but is often too simple to fully capture the complex context of a patient’s record. The treatments during the patient’s previous hospitalization are also important to consider, and are often recorded as a free-text diagnosis when the patient has revisited a hospital under critical status. For example, if a patient has cancer, the previous chemotherapy course is important information when the patient is seriously ill in the emergency department. Furthermore, it is difficult for doctors to accurately find the correct codes; thus, it is insufficient to represent a patient’s condition by simply obtaining the ICD-10 code from the EHR. However, the ICD-10 codes can be used to extend the pretrained training set by random stitching. </p>
        <p>Combining a random number of diagnoses not only extends the training dataset but also improves the performance of the model. The average number of characters in a diagnosis was 355, but the range was larger (SD 318). In the absence of augmentation, the position embeddings and self-attention heads trained more in the front and demonstrated poorer performance in the back. Augmentation combines several diagnoses to lengthen the input embeddings, which can train the self-attention heads to consider all 1350 characters equally.</p>
        <p>In the prediction phase, we obtained the probability of each character. Since a word is split into a sequence of characters, the result is fragmented, and only some characters in a word were selected by prediction. This results in a nonsense phrase and produces poor results. Accordingly, we proposed a cleanup method that selects the entire word based on the probability of all characters being present in the word. This concept is derived from the segmentation task in computer vision, in which each pixel is classified independently, which can cause the predictions to be discontinuous. In the field of computer vision, contour-based superpixels are chosen, and all superpixels are selected by a majority vote [<xref ref-type="bibr" rid="ref31">31</xref>]. In this study, the average probability of an entire word represents the probability of each character and results in either the entire word being selected or none at all.</p>
        <p>Since the summarization task is subjective, properly evaluating the performance of the model is a relevant consideration. Lack of adequate medical labels is an important issue, because labels from qualified physicians are rare and difficult to collect. Although the ROUGE score [<xref ref-type="bibr" rid="ref54">54</xref>] is widely used in this field, it is evaluated by the same doctors’ labels and even by separate split sets.</p>
        <p>Owing to the lack of doctors who are capable of labeling the reference summaries, all of the models evaluated in this study were limited to being fine-tuned by Doctor A’s labels. We were able to shuffle and randomly split the three doctors’ labels to training, validation, and testing sets, but we did not have reference labels from other doctors to confirm whether individual variation exists. Even when using the three doctors’ labels, this problem would occur when gathering another doctor’s labels. </p>
        <p>To confirm the differences from other doctors, the models were fine-tuned using only one doctor’s knowledge, with the others’ used as a test set. The results revealed a difference according to the ROUGE scores (<xref ref-type="table" rid="table1">Table 1</xref>) from the three doctors. The model had a poor ROUGE score on the label references for Doctor C, implying that summarization is a highly subjective task. Certain words are important for some doctors, but not for others, even among doctors in the same medical field who have similar interpretation processes. Therefore, it was very easy to overfit the model with the summarization task. BioBERT had the most accurate prediction result, but the associated overfitting was also more severe.</p>
        <p>We established a website for doctors to easily critique the performance within label references and the predictions from the models to further objectively evaluate the performance of the model and the reference labels from doctors. We used a double-blind method to collect scores, and the system randomly chose a diagnosis and displayed corresponding summary proposals by random ordering. The critical reviewer was therefore blinded to the method used for each prediction. We obtained similar results to the ROUGE scores from this analysis. Moreover, the LSTM was consistently the lowest-performing model, whereas manually labeled references achieved the highest average score, followed by BioBERT.</p>
        <p>Although the performance of AlphaBERT was not optimum, there was nevertheless no statistically significant difference between the performances of BERT, BioBERT, and AlphaBERT. The advantage of AlphaBERT is the character-level prediction probability and its one-to-one correspondence with the original document. The predicted keywords can be highlighted directly on the original document and can be easily edited by users. For example, although AlphaBERT’s predicted proposal had a ROUGE-L score of 0.701, it makes sense to recognize important words, which is perhaps more informative than a doctor’s reference label (<xref rid="figure4" ref-type="fig">Figure 4</xref>). In some cases, our proposed method could predict more information about the disease and related treatments, whereas in other cases some diseases were lost (eg, pneumonia, hypertension, and respiratory failure), and in other cases the formal medical term was predicted but the reference label was an abbreviation (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). This variation also reflects the subjectivity of the summary task.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Illustration of the performance of AlphaBERT.</p>
          </caption>
          <graphic xlink:href="medinform_v8i4e17787_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Due to the subjective nature of the text summarization task, the predicted summary results may lose some information that may be of relevance. The proposed model helps hospital staff to quickly view information for a large number of patients at the beginning of a shift; however, they will still need to read all of the collected information from the EHRs during ward rounds. </p>
        <p>Typos and misspellings remain a problem in NLP. However, the character-level and word pieces BPE method can not only reduce the vocabulary but can also handle typos effectively to maintain noninferior results (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Although automatic spelling correction may be a solution to this problem, we have not included this feature in our proposed method because we are confident in the robust error tolerance of the character-level and BPE method.</p>
        <p>This was a pilot study in the medical text summarization field based on the deep-learning method. We plan to establish a website that offers this service and provides a way to edit suggestions and feedback to collect volunteer labels and resolve personal variability in the near future. </p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>AlphaBERT, using character-level tokens in a BERT-based model, can greatly decrease model size without significantly reducing performance for text summarization tasks. The proposed model will provide a method to further extract the unstructured free-text portions in EHRs to obtain an abundance of health data. As we enter the forefront of the artificial intelligence era, NLP deep-learning models are well under development. In our model, all medical free-text data can be transformed into meaningful embeddings, which will enhance medical studies and strengthen doctors’ capabilities.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Input embedding.</p>
        <media xlink:href="medinform_v8i4e17787_app1.PNG" xlink:title="PNG File, 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Flowchart to determine the hyperparameters and measure the model’s performance.</p>
        <media xlink:href="medinform_v8i4e17787_app2.PNG" xlink:title="PNG File, 35 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Error statistics (strong and weak).</p>
        <media xlink:href="medinform_v8i4e17787_app3.pdf" xlink:title="PDF File (Adobe PDF File), 409 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Error statistics (typos, misspellings, or incorrect words).</p>
        <media xlink:href="medinform_v8i4e17787_app4.pdf" xlink:title="PDF File (Adobe PDF File), 703 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUROCs</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BPE</term>
          <def>
            <p>byte-pair encoding</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">ICD-10</term>
          <def>
            <p>International Statistical Classification of Diseases and Related Health Problems 10th Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">NTUH-iMD</term>
          <def>
            <p>National Taiwan University Hospital Integrated Medical Database</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ROUGE</term>
          <def>
            <p>Recall-Oriented Understudy for Gisting Evaluation</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We would like to thank the Ministry of Science and Technology, Taiwan, for financially supporting this research (grant MOST 109-2634-F-002-029). We would also like to thank Yun-Nung Chen for providing useful comments on this work and Hugging Face for providing several excellent deep-learning codings. We are grateful to GitHub for providing the code repository used for AlphaBERT.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Juang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Emergency department overcrowding: Quality improvement in a Taiwan Medical Center</article-title>
          <source>J Formos Med Assoc</source>
          <year>2019</year>
          <month>01</month>
          <volume>118</volume>
          <issue>1</issue>
          <fpage>186</fpage>
          <lpage>193</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0929-6646(17)30790-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jfma.2018.03.008</pub-id>
          <pub-id pub-id-type="medline">29665984</pub-id>
          <pub-id pub-id-type="pii">S0929-6646(17)30790-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hsieh</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Professional resilience among nurses working in an overcrowded emergency department in Taiwan</article-title>
          <source>Int Emerg Nurs</source>
          <year>2019</year>
          <month>01</month>
          <volume>42</volume>
          <fpage>44</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ienj.2018.05.005</pub-id>
          <pub-id pub-id-type="medline">29954706</pub-id>
          <pub-id pub-id-type="pii">S1755-599X(18)30069-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <source>World Health Organization</source>
          <access-date>2016-01-01</access-date>
          <comment>ICD-10 Version: 2019<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://icd.who.int/browse10/2019/en">https://icd.who.int/browse10/2019/en</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cassell</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>12</month>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>1007</fpage>
          <lpage>1015</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26911811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id>
          <pub-id pub-id-type="medline">26911811</pub-id>
          <pub-id pub-id-type="pii">ocv180</pub-id>
          <pub-id pub-id-type="pmcid">PMC4997034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Workman</surname>
              <given-names>TE</given-names>
            </name>
            <name name-style="western">
              <surname>Fiszman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hurdle</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>Text summarization as a decision support aid</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2012</year>
          <month>05</month>
          <day>23</day>
          <volume>12</volume>
          <fpage>41</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-12-41"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-12-41</pub-id>
          <pub-id pub-id-type="medline">22621674</pub-id>
          <pub-id pub-id-type="pii">1472-6947-12-41</pub-id>
          <pub-id pub-id-type="pmcid">PMC3461485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gigioli</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sagar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Voyles</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Domain-Aware Abstractive Text Summarization for Medical Documents</article-title>
          <year>2018</year>
          <conf-name>IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>
          <conf-date>December 3-6, 2018</conf-date>
          <conf-loc>Madrid, Spain</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8621539"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/BIBM.2018.8621539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nallapati</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gulcehre</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Abstractive Text Summarization using Sequence-to-sequence RNNs and Beyond</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2016</year>
          <month>08</month>
          <conf-name>Proceedings of The 20th SIGNLL Conference on Computational Natural Language Learning</conf-name>
          <conf-date>August 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/K16-1028/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/K16-1028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>See</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Get To The Point: Summarization with Pointer-Generator Networks</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2017</year>
          <month>07</month>
          <conf-name>Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>July 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>1073</fpage>
          <lpage>1083</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/P17-1099"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/P17-1099</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Neural Document Summarization by Jointly Learning to Score and Select Sentences</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2018</year>
          <month>07</month>
          <conf-name>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>654</fpage>
          <lpage>663</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/P18-1061"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/p18-1061</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McKeown</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Facilitating physicians' access to information via tailored text summarization</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2005</year>
          <fpage>226</fpage>
          <lpage>230</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/16779035"/>
          </comment>
          <pub-id pub-id-type="medline">16779035</pub-id>
          <pub-id pub-id-type="pii">57510</pub-id>
          <pub-id pub-id-type="pmcid">PMC1560854</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Niu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hirst</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Using outcome polarity in sentence extraction for medical question-answering</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2006</year>
          <fpage>599</fpage>
          <lpage>603</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17238411"/>
          </comment>
          <pub-id pub-id-type="medline">17238411</pub-id>
          <pub-id pub-id-type="pii">85570</pub-id>
          <pub-id pub-id-type="pmcid">PMC1839454</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mollá</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Paris</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Extractive summarisation of medical documents using domain knowledge and corpus statistics</article-title>
          <source>Australas Med J</source>
          <year>2012</year>
          <volume>5</volume>
          <issue>9</issue>
          <fpage>478</fpage>
          <lpage>481</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23115581"/>
          </comment>
          <pub-id pub-id-type="doi">10.4066/AMJ.2012.1361</pub-id>
          <pub-id pub-id-type="medline">23115581</pub-id>
          <pub-id pub-id-type="pii">20121361</pub-id>
          <pub-id pub-id-type="pmcid">PMC3477776</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ranjan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Prakash</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saha</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Automatic labelling of important terms and phrases from medical discussions</article-title>
          <source>IEEE</source>
          <year>2017</year>
          <month>11</month>
          <day>03</day>
          <conf-name>2017 Conference on Information and Communication Technology (CICT)</conf-name>
          <conf-date>November 3-5, 2017</conf-date>
          <conf-loc>Gwalior, India</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8340644"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/INFOCOMTECH.2017.8340644</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fiszman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rindflesch</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Kilicoglu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Summarizing drug information in Medline citations</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2006</year>
          <fpage>254</fpage>
          <lpage>258</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17238342"/>
          </comment>
          <pub-id pub-id-type="medline">17238342</pub-id>
          <pub-id pub-id-type="pii">86447</pub-id>
          <pub-id pub-id-type="pmcid">PMC1839479</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fiszman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kilicoglu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rindflesch</surname>
              <given-names>TC</given-names>
            </name>
          </person-group>
          <article-title>Automatic summarization of MEDLINE citations for evidence-based medical treatment: a topic-oriented evaluation</article-title>
          <source>J Biomed Inform</source>
          <year>2009</year>
          <month>10</month>
          <volume>42</volume>
          <issue>5</issue>
          <fpage>801</fpage>
          <lpage>813</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(08)00126-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2008.10.002</pub-id>
          <pub-id pub-id-type="medline">19022398</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(08)00126-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC2776079</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarkar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Nasipuri</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Using Machine Learning for Medical Document Summarization</article-title>
          <source>Int J Database Theor Appl</source>
          <year>2011</year>
          <month>03</month>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>31</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://modul.repo.mercubuana-yogya.ac.id/modul/files/openjournal/JournalOfDesign/4_586.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27219127"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shahar</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An automated knowledge-based textual summarization system for longitudinal, multivariate clinical data</article-title>
          <source>J Biomed Inform</source>
          <year>2016</year>
          <month>06</month>
          <volume>61</volume>
          <fpage>159</fpage>
          <lpage>175</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(16)30015-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2016.03.022</pub-id>
          <pub-id pub-id-type="medline">27039119</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(16)30015-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aramaki</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Miura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tonoike</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ohkuma</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Masuichi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ohe</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>TEXT2TABLE: Medical Text Summarization System Based on Named Entity Recognition and Modality Identification</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2009</year>
          <month>06</month>
          <conf-name>Proceedings of the BioNLP 2009 Workshop</conf-name>
          <conf-date>June 2009</conf-date>
          <conf-loc>Boulder, Colorado</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>185</fpage>
          <lpage>192</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W09-1324/"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1572364.1572390</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Sriram</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bush</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Haines</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Automated extraction of clinical traits of multiple sclerosis in electronic medical records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2013</year>
          <month>12</month>
          <volume>20</volume>
          <issue>e2</issue>
          <fpage>e334</fpage>
          <lpage>e340</lpage>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001999</pub-id>
          <pub-id pub-id-type="medline">24148554</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001999</pub-id>
          <pub-id pub-id-type="pmcid">PMC3861927</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2019</year>
          <month>06</month>
          <conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N19-1423/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Attention Is All You Need</article-title>
          <source>NIPS'17: Proceedings of the 31st International Conference on Neural Information Processing Systems</source>
          <year>2017</year>
          <month>12</month>
          <publisher-loc>Red Hook, NY</publisher-loc>
          <publisher-name>Curran Associates Inc</publisher-name>
          <fpage>6000</fpage>
          <lpage>6010</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <year>2018</year>
          <conf-name>2018 Conference of the North American Chapter for Computational Linguistics (NAACL)</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lopyrev</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>SQuAD: 100,000+ questions for machine comprehension of text</article-title>
          <year>2016</year>
          <conf-name>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 2016</conf-date>
          <conf-loc>Austin, Texas</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>2383</fpage>
          <lpage>2392</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d16-1264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long Short-Term Memory</article-title>
          <source>Neural Computation</source>
          <year>1997</year>
          <month>11</month>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>1780</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Auto-encoding variational Bayes</source>
          <year>2013</year>
          <access-date>2013-12-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1312.6114">https://arxiv.org/abs/1312.6114</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep residual learning for image recognition</article-title>
          <year>2016</year>
          <conf-name>Proceedings of the IEEE conference on computer vision pattern recognition</conf-name>
          <conf-date>June 26-July 1, 2016</conf-date>
          <conf-loc>Las Vegas, NV</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2016.90</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kiros</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>arxiv</source>
          <year>2016</year>
          <month>07</month>
          <day>21</day>
          <access-date>2016-07-21</access-date>
          <comment>Layer normalization<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1607.06450">https://arxiv.org/abs/1607.06450</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hovy</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Chin-Yew</given-names>
            </name>
          </person-group>
          <article-title>Automated text summarization and the SUMMARIST system</article-title>
          <source>Association for Computational Linguistics</source>
          <year>1998</year>
          <month>10</month>
          <day>13</day>
          <conf-name>TIPSTER '98: Proceedings of a Workshop held at Baltimore, Maryland</conf-name>
          <conf-date>October 13-15, 1998</conf-date>
          <conf-loc>Baltimore, Maryland</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>197</fpage>
          <lpage>214</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W97-0704"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1119089.1119121</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ronneberger</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Brox</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>U-net: Convolutional networks for biomedical image segmentation</article-title>
          <year>2015</year>
          <conf-name>International Conference on Medical image computing and computer-assisted intervention</conf-name>
          <conf-date>October 2015</conf-date>
          <conf-loc>Munich, Germany</conf-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>234</fpage>
          <lpage>241</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caelles</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Maninis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pont-Tuset</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Leal-Taixé</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cremers</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Van Gool</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>One-shot video object segmentation</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>July 21-26, 2017</conf-date>
          <conf-loc>Honolulu, Hawaii</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2017.565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Learning to encode text as human-readable summaries using generative adversarial networks</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2018</year>
          <conf-name>2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October-November, 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D18-1451/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1451</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rush</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chopra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A neural attention model for abstractive sentence summarization</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2015</year>
          <conf-name>2015 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>September 2015</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D15-1044/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d15-1044</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conneau</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kiela</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schwenk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Barrault</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bordes</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>Supervised learning of universal sentence representations from natural language inference data</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2017</year>
          <conf-name>2017 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>September 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1705.02364.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d17-1070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yosinski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Clune</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lipson</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>How transferable are features in deep neural networks?</article-title>
          <year>2014</year>
          <month>12</month>
          <conf-name>NIPS'14: Proceedings of the 27th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 8-13, 2014</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
          <publisher-name>MIT Press</publisher-name>
          <fpage>3320</fpage>
          <lpage>3328</lpage>
          <pub-id pub-id-type="doi">10.5555/2969033.2969197</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Carbonell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2019</year>
          <access-date>2019-06-19</access-date>
          <comment>XLNet: Generalized Autoregressive Pretraining for Language Understanding<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1906.08237">https://arxiv.org/abs/1906.08237</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Delangue</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Moi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>Huggingface’s transformers: State-of-the-art natural language processing</source>
          <access-date>2019-01-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/">https://huggingface.co/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Desouza</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mercer</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pietra</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Class-based n-gram models of natural language</article-title>
          <source>Comput Ling</source>
          <year>1992</year>
          <fpage>467</fpage>
          <lpage>480</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/J92-4003/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <year>2013</year>
          <conf-name>NIPS'13: Proceedings of the 26th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, Nevada</conf-loc>
          <publisher-name>Curran Associates Inc</publisher-name>
          <fpage>3111</fpage>
          <lpage>3119</lpage>
          <pub-id pub-id-type="doi">10.5555/2999792.2999959</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cerisara</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Denis</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>arxiv</source>
          <access-date>2017-07-13</access-date>
          <comment>Do Convolutional Networks need to be Deep for Text Classification?<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1707.04108">https://arxiv.org/abs/1707.04108</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching Word Vectors with Subword Information</article-title>
          <source>Trans Assoc Comput Ling</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xenouleas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malakasiotis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Apidianaki</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>SUM-QE: a BERT-based Summary Quality Estimation Model</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2019</year>
          <month>11</month>
          <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name>
          <conf-date>November, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>6005</fpage>
          <lpage>6011</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D19-1618/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/D19-1618</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Rfou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Choe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Constant</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Character-Level Language Modeling with Deeper Self-Attention</article-title>
          <source>AAAI</source>
          <year>2019</year>
          <month>07</month>
          <day>17</day>
          <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>January 27-February 1, 2019</conf-date>
          <conf-loc>Honolulu, Hawaii</conf-loc>
          <fpage>3159</fpage>
          <lpage>3166</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v33i01.33013159</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lecun</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Character-level Convolutional Networks for Text Classification</article-title>
          <year>2015</year>
          <conf-name>Advances in Neural Information Processing Systems 28 (NIPS 2015)</conf-name>
          <conf-date>December 7-12, 2015</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Norouzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Macherey</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Google's neural machine translation system: Bridging the gap between human and machine translation</article-title>
          <source>Trans Assoc Comput Ling</source>
          <year>2017</year>
          <month>10</month>
          <volume>5</volume>
          <fpage>339</fpage>
          <lpage>351</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/Q17-1024.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>Jaewoo</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <source>Ubuntu</source>
          <access-date>2019-01-01</access-date>
          <comment>Download Ubuntu Desktop<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ubuntu.com/download/desktop">https://ubuntu.com/download/desktop</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="web">
          <source>Anaconda</source>
          <access-date>2019-01-01</access-date>
          <comment>Solutions for Data Science Practitioners and Enterprise Machine Learning<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.anaconda.com/">https://www.anaconda.com/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="web">
          <source>PyTorch</source>
          <access-date>2019-01-01</access-date>
          <comment>From Research to Production<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://pytorch.org/">https://pytorch.org/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>WL</given-names>
            </name>
          </person-group>
          <article-title>“Cloze Procedure”: A New Tool for Measuring Readability</article-title>
          <source>Journalism Quart</source>
          <year>1953</year>
          <month>09</month>
          <day>01</day>
          <volume>30</volume>
          <issue>4</issue>
          <fpage>415</fpage>
          <lpage>433</lpage>
          <pub-id pub-id-type="doi">10.1177/107769905303000401</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Adam: A Method for Stochastic Optimization</article-title>
          <year>2015</year>
          <conf-name>International Conference for Learning Representations (ICLR) 2015</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>San Diego, California</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1412.6980"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Szegedy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ioffe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vanhoucke</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Alemi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Inception-v4, inception-resnet and the impact of residual connections on learning</article-title>
          <source>AAAI</source>
          <year>2017</year>
          <month>02</month>
          <conf-name>Thirty-First AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>February 2017</conf-date>
          <conf-loc>San Francisco, California</conf-loc>
          <publisher-name>AAAI Press</publisher-name>
          <fpage>4278</fpage>
          <lpage>4284</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Dollár</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Girshick</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Noordhuis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wesolowski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kyrola</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <access-date>2018-04-30</access-date>
          <comment>Accurate, large minibatch sgd: Training imagenet in 1 hour<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.02677">https://arxiv.org/abs/1706.02677</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>ROUGE: A Package for Automatic Evaluation of Summaries</article-title>
          <source>Association for Computational Linguistics</source>
          <year>2004</year>
          <month>07</month>
          <conf-name>Workshop on Text Summarization Branches Out, Post-Conference Workshop of ACL 2004</conf-name>
          <conf-date>July 2004</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <fpage>74</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W04-1013"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitchell-Box</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Braun</surname>
              <given-names>KL</given-names>
            </name>
          </person-group>
          <article-title>Fathers' thoughts on breastfeeding and implications for a theory-based intervention</article-title>
          <source>J Obstet Gynecol Neonatal Nurs</source>
          <year>2012</year>
          <volume>41</volume>
          <issue>6</issue>
          <fpage>E41</fpage>
          <lpage>E50</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1552-6909.2012.01399.x</pub-id>
          <pub-id pub-id-type="medline">22861175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>YP</given-names>
            </name>
          </person-group>
          <source>Azure</source>
          <access-date>2020-01-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://diagnosislabelevaluateweb.azurewebsites.net/Extract">http://diagnosislabelevaluateweb.azurewebsites.net/Extract</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>YP</given-names>
            </name>
          </person-group>
          <source>Github</source>
          <access-date>2020-04-10</access-date>
          <comment>AlphaBERT<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/wicebing/AlphaBERT.git">https://github.com/wicebing/AlphaBERT.git</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
