<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i8e23230</article-id>
      <article-id pub-id-type="pmid">34463639</article-id>
      <article-id pub-id-type="doi">10.2196/23230</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Automatic ICD-10 Coding and Training System: Deep Neural Network Based on Supervised Learning</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kimia</surname>
            <given-names>Amir</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Lim</surname>
            <given-names>Gilbert</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Frontoni</surname>
            <given-names>Emanuele</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Pei-Fu</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <xref rid="aff02" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0192-5377</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Ssu-Ming</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4359-1683</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Liao</surname>
            <given-names>Wei-Chih</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2016-3483</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Kuo</surname>
            <given-names>Lu-Cheng</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff03" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4247-5197</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Kuan-Chih</given-names>
          </name>
          <degrees>MD, MSc</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <xref rid="aff04" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6526-8097</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Yu-Cheng</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff05" ref-type="aff">5</xref>
          <xref rid="aff06" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2264-4491</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Chi-Yu</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff07" ref-type="aff">7</xref>
          <xref rid="aff08" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5793-0397</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Chiu</surname>
            <given-names>Chi-Hao</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff09" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6807-8387</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Chang</surname>
            <given-names>Shu-Chih</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff10" ref-type="aff">10</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1392-382X</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>Feipei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <xref rid="aff11" ref-type="aff">11</xref>
          <address>
            <institution>Department of Computer Science and Information Engineering</institution>
            <institution>National Taiwan University</institution>
            <addr-line>No 1, Sec 4, Roosevelt Road</addr-line>
            <addr-line>Taipei, 10617</addr-line>
            <country>Taiwan</country>
            <phone>886 0911126526</phone>
            <email>flai@ntu.edu.tw</email>
          </address>
          <xref rid="aff12" ref-type="aff">12</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7147-8122</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff01">
        <label>1</label>
        <institution>Graduate Institute of Biomedical Electronics and Bioinformatics</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff02">
        <label>2</label>
        <institution>Department of Anesthesiology</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff03">
        <label>3</label>
        <institution>Department of Internal Medicine</institution>
        <institution>National Taiwan University Hospital</institution>
        <institution>National Taiwan University College of Medicine</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff04">
        <label>4</label>
        <institution>Department of Internal Medicine</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff05">
        <label>5</label>
        <institution>Department of Medical Affairs</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff06">
        <label>6</label>
        <institution>Department of Healthcare Administration</institution>
        <institution>Oriental Institute of Technology</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff07">
        <label>7</label>
        <institution>Department of Information Technology</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff08">
        <label>8</label>
        <institution>Section of Cardiovascular Medicine</institution>
        <institution>Cardiovascular Center</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff09">
        <label>9</label>
        <institution>Section of Health Insurance</institution>
        <institution>Department of Medical Affairs</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff10">
        <label>10</label>
        <institution>Medical Records Department</institution>
        <institution>Far Eastern Memorial Hospital</institution>
        <addr-line>New Taipei City</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff11">
        <label>11</label>
        <institution>Department of Computer Science and Information Engineering</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff12">
        <label>12</label>
        <institution>Department of Electrical Engineering</institution>
        <institution>National Taiwan University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Feipei Lai <email>flai@ntu.edu.tw</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>8</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>31</day>
        <month>8</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>8</issue>
      <elocation-id>e23230</elocation-id>
      <history>
        <date date-type="received">
          <day>5</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>19</day>
          <month>1</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>15</day>
          <month>3</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>7</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Pei-Fu Chen, Ssu-Ming Wang, Wei-Chih Liao, Lu-Cheng Kuo, Kuan-Chih Chen, Yu-Cheng Lin, Chi-Yu Yang, Chi-Hao Chiu, Shu-Chih Chang, Feipei Lai. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 31.08.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/8/e23230" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The International Classification of Diseases (ICD) code is widely used as a reference in medical systems and for billing purposes. However, classifying diseases into ICD codes still mainly relies on humans reading a large amount of written material as the basis for coding. Coding is both laborious and time-consuming. Since the conversion of ICD-9 to ICD-10, the coding task has become much more complicated, and deep learning– and natural language processing–related approaches have been studied to assist disease coders.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This paper aims at constructing a deep learning model for ICD-10 coding, where the model is meant to automatically determine the corresponding diagnosis and procedure codes based solely on free-text medical notes to improve accuracy and reduce human effort.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used diagnosis records of the National Taiwan University Hospital as resources and applied natural language processing techniques, including global vectors, word to vectors, embeddings from language models, bidirectional encoder representations from transformers, and single head attention recurrent neural network, on the deep neural network architecture to implement ICD-10 auto-coding. In addition, we introduced the attention mechanism into the classification model to extract the keywords from diagnoses and visualize the coding reference for training new coders in ICD-10. Sixty discharge notes were randomly selected to examine the change in the F<sub>1</sub>-score and the coding time by coders before and after using our model.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In experiments on the medical data set of National Taiwan University Hospital, our prediction results revealed F<sub>1</sub>-scores of 0.715 and 0.618 for the ICD-10 Clinical Modification code and Procedure Coding System code, respectively, with a <italic>bidirectional encoder representations from transformers</italic> embedding approach in the Gated Recurrent Unit classification model. The well-trained models were applied on the ICD-10 web service for coding and training to ICD-10 users. With this service, coders achieved a significantly higher F<sub>1</sub>-score, which increased from a median of 0.832 to 0.922 (<italic>P</italic>&#60;.05), although the time spent coding was not reduced.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The proposed model significantly improved the F<sub>1</sub>-score but did not decrease the time consumed in coding by disease coders.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>deep learning</kwd>
        <kwd>International Classification of Diseases</kwd>
        <kwd>Recurrent Neural Network</kwd>
        <kwd>text classification</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The International Classification of Diseases (ICD) is a medical classification list released by the World Health Organization, which defines the universe of diseases, disorders, injuries, and other related health conditions and the classifying standard of diagnosis [<xref ref-type="bibr" rid="ref1">1</xref>]. Since the first publication in 1893, the ICD has become one of the most important indexes in medical management systems, health insurance, or literature research.</p>
      <p>At present, in most medical institutions, ICD-10 codes that are used in diagnosis-related group subsidy for inpatients mainly rely on manual coding from a group of licensed and professional disease coders on a case-by-case basis, who spend a lot of time reading a multitude of medical materials. On the other hand, some other cases—especially outpatients—are coded by physicians.</p>
      <p>Since the conversion from ICD-9 to ICD-10 in 2014, Taiwan has used the ICD-10 as the reference for diagnosis-related group subsidy. However, because of the complexity of the ICD-10 structure and coding rules such as the code orders, the inclusion and exclusion criteria, and the enormously increasing number of ICD-10 codes, ICD-10 coding work has become much more laborious and time-consuming; even a professionally trained disease coder takes approximately 30 minutes per case on average. According to the analysis from <italic>Handbook of Research on Informatics in Healthcare and Biomedicine</italic>, the cost for adopting the ICD-10 system, including training of disease coders, physicians, and code users; initial and long-term loss of productivity among providers; and sequential conversion, is estimated to range from a 1-time cost of US $425 million to US $1.15 billion in addition to US $5-40 million per year in lost productivity [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>Previous studies had built a model for the ICD-9 system. In 2008, Farkas and Szarvas [<xref ref-type="bibr" rid="ref3">3</xref>] utilized a rule-based approach querying other reference tools to implement the ICD auto-coding task. However, compared to ICD-9, ICD-10 contains more than 60,000 codes. Building a rule-based automatic system is labor-intensive and time-consuming. In addition, the entirety of the rules of the ICD-10 system is complicated even for disease coders. For the aforementioned reasons, recent studies have focused on deep learning– and natural language processing (NLP)–related approaches; for instance, Zhang et al [<xref ref-type="bibr" rid="ref4">4</xref>] used a gated recurrent unit (GRU) network with content-based attention to predict medication prescriptions on the basis of the disease codes, and Wang et al [<xref ref-type="bibr" rid="ref5">5</xref>] applied and compared NLP techniques such as Global Vectors (GloVe) in an electronic health record (EHR) data classification task.</p>
      <p>In previous studies, we have already applied word to vectors (Word2Vec), an NLP method, in an ICD-10 auto-coding task and achieved an F<sub>1</sub>-score of 0.67/0.58 in Clinical Modification (CM)/Procedure Coding System (PCS). Furthermore, we also built an ICD-10 code recommendation system for ICD-10 users [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. In this study, we made a comparison on most of the recent NLP approaches such as Word2Vec, embeddings from language models (ELMo), and bidirectional encoder representations from transformers (BERT). Furthermore, we introduced the attention mechanism to our classification model to visualize the word importance for training new coders in ICD-10 coding.</p>
      <p>In the ICD classification framework illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>, the left panel denotes the large amounts of free-text data written by physicians, which would be read and learned by the classifier in the right panel of the graph with supervised learning. Well-trained classifiers would be applied to predict the ICD-10 codes accurately for each patient. Furthermore, to distinguish the primary, secondary, or additional diagnosis, a sequential correction was conducted by coding the ICD-10 codes in a sequential format, using a sequence-to-sequence model followed by combining the classification coding results with the sequential order outcome.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Training and validation process for the ICD-10 classification and attention models. BBW: birth body weight; GA: gestational age; PRP: platelet-rich plasma.</p>
        </caption>
        <graphic xlink:href="medinform_v9i8e23230_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The attention framework for paragraph highlighting is also illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Different from the classification framework, the input data in the left panel include both the diagnoses and the corresponding ICD-10 definitions from the National Health Insurance Administration rather than using merely the diagnoses, and the output data in the right panel is the attention weight matrix extracted from the predicting process rather than the classification result. With a combination of these 2 methods, we constructed an ICD-10 auto-coding and training system to assist ICD-10 code users.</p>
      <p>Our study aims at building an automatic ICD-10 coding and training system based on NLP technology, attention mechanism, and Deep Neural Network (DNN) models, which are applied for extracting information from EHR data, highlighting the key points from the extracted features, and implementing an ICD-10 classification task with sequential correction, respectively, for assisting all ICD-10 users.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Description</title>
        <p>Our data were acquired from patients at National Taiwan University Hospital (NTUH) from January 2016 to July 2018. The ground-truth ICD-10 codes were annotated by the coders at NTUH. Data attributes and types include account IDs, type contents, course and treatment, and discharge diagnoses. The distribution of ICD-10 codes is shown in our previous study [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      </sec>
      <sec>
        <title>System Architecture</title>
        <p>The entire process of the system constructing framework is composed of data processing, feature extracting, model constructing, model training, and web service building. To detail and visualize the ICD-10 web service clearly in this study, the complete workflow of the ICD-10 coding and training system is illustrated in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Complete framework of the ICD-10 auto-coding and training system. API: application programming interface; ICU: intensive care unit.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Processing</title>
        <sec>
          <title>Preprocessing</title>
          <p>Preprocessing, including the removal of Chinese words, null or duplicate elements, punctuation, stop words, and infrequent words, was applied before tokenization of the texts. The basic preprocessing methods were applied using the Natural Language Toolkit [<xref ref-type="bibr" rid="ref8">8</xref>] and Scikit-Learn [<xref ref-type="bibr" rid="ref9">9</xref>] library. We then randomly split the data set at a 9:1 ratio into training and validation sets with the Scikit-Learn library.</p>
        </sec>
        <sec>
          <title>Postprocessing</title>
          <p>In ICD-10 coding, combination codes remain an intractable issue because, in some cases, disease coders cannot—and should not—assign multiple diagnosis codes when a single combination code clearly identifies all aspects of the patient’s diagnosis [<xref ref-type="bibr" rid="ref10">10</xref>]. In this study, a user-defined panel is provided in the auto-coding system to deal with combination codes by replacing the incorrect outcomes, where the combination codes were either predicted incorrectly or separated into 2 different codes on the basis of the given codes.</p>
        </sec>
      </sec>
      <sec>
        <title>Feature Extraction</title>
        <p>During feature extraction, we applied NLP techniques, including GloVe [<xref ref-type="bibr" rid="ref11">11</xref>], Word2Vec [<xref ref-type="bibr" rid="ref12">12</xref>], ELMo [<xref ref-type="bibr" rid="ref13">13</xref>], BERT [<xref ref-type="bibr" rid="ref14">14</xref>], and single head attention recurrent neural network (SHA-RNN), to convert the word contexts to numerical data and extract word and contextual information. Except for the BERT-based pretrained weight, we also attempted clinicalBERT [<xref ref-type="bibr" rid="ref15">15</xref>] and BioBERT [<xref ref-type="bibr" rid="ref16">16</xref>], which were trained with clinical notes from MIMIC-III, PubMed, and PubMed Central. Hyperparameters of the embedding models are attached in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Hyperparameters of word-embedding models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="470"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Hyperparameters</td>
                <td>Size/number</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>Global Vector</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Word embedding size</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Word to Vectors</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Word embedding size</td>
                <td>300</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Embeddings from Language Models</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Convolutional neural network char embedding size</td>
                <td>50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Convolutional neural network word embedding size</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Highway number</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Intermediate size</td>
                <td>512</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Bidirectional encoder representations from transformers<sup>a</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Word embedding size</td>
                <td>768</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sentence embedding size</td>
                <td>768</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Position embedding size</td>
                <td>768</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Intermediate size</td>
                <td>3072</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Attention head number</td>
                <td>12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hidden layer number</td>
                <td>12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Dropout</td>
                <td>0.1</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Single head attention recurrent neural network</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Word embedding size</td>
                <td>1024</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hidden size</td>
                <td>1024</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Layer number</td>
                <td>4</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Clinical bidirectional encoder representations from transformers (BERT) and BERT for biomedical text mining shared the same hyperparameters with BERT.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Classification Model</title>
        <p>The classification model was constructed with 4 neural network layers, including RNN and fully connected neural network (FCNN), where the hyperparameters are shown in <xref ref-type="table" rid="table2">Table 2</xref> and the architecture is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. The first layer is the word embedding layer, which transforms the tokenized word list input into word vectors. The second layer is a bidirectional GRU (BiGRU) layer [<xref ref-type="bibr" rid="ref17">17</xref>]. The remaining 2 layers are fully connected layers, where the final fully connected layer should be set to the size of the dimension we expect to predict. In our case, we conducted 2 classification tasks, including whole label classification for CM and PCS with 14,602/9780 labels of CM/PCS in NTUH data records in total. Hence, the final fully connected layer size should be set to 14,602 and 9780 dimensions, respectively. To make a comparison, a classification model with only 1 fully connected layer—fully connected layer 2—was used as the baseline model. In addition, the attention mechanism based on the Bahdanau [<xref ref-type="bibr" rid="ref18">18</xref>] attention model was introduced to our classification model to further extract the keywords for ICD-10 coding by computing the weight information of context—ICD title–vector pairs; that is, the importance of the information with respect to the current target word.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Hyperparameters of the classification models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Hyperparameters</td>
                <td>Size</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Bidirectional GRU<sup>a</sup> layer</td>
                <td>256</td>
              </tr>
              <tr valign="top">
                <td>Fully connected layer 1</td>
                <td>700</td>
              </tr>
              <tr valign="top">
                <td>Fully connected layer 2 CM/PCS<sup>b</sup></td>
                <td>14,602/9780</td>
              </tr>
              <tr valign="top">
                <td>Dropout</td>
                <td>0.2</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>GRU: Gated Recurrent Unit.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>CM/PCS: Clinical Modification/Procedure Coding System.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Architecture of the Deep Neural Network classification model. BiGRU: Bidirectional Gated Recurrent Unit; GRU: Gated Recurrent Unit; PReLU: Parametric Rectified Linear Unit.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Assessment</title>
        <p>Micro F<sub>1</sub>-score is the harmonic mean of recall and precision, which are the sum of the number of true-positive results divided by sum of the number of all positive results and the sum of the number of true-positive results divided by the sum of the number of all relevant samples, respectively. The micro F<sub>1</sub>-score considers the number for each label while calculating the recall and precision; hence, it is appropriate for evaluating the performance of a multi-label classification task with an imbalanced data set.</p>
        <p>For realistic application in the auto-coding system, recall@K, which calculates the proportion of correct answers in the first K prediction results returned by the classifier, was also applied for validating the model’s performance. In our case, considering the limitation of the quantity of CM and PCS codes, 20 was chosen as the K value.</p>
      </sec>
      <sec>
        <title>ICD-10 Coding and Training System Framework</title>
        <p>An ICD-10 auto-coding and training system prototype was constructed with python3, ASP.NET Core 2.2 MVC, SQL Server, and Vue.js. Whenever a user performs an action, such as typing a discharge diagnosis or retrieving information from a database on the frontend interface built with Vue.js, axios, a promise-based HTTP client for the browser and node.js, would call for the Web application programming interface in the backend built with ASP.NET Core 2.2 MVC to send the case information to the backend for predicting and processing via python3 or to the database for data preservation in SQL Server. The complete system framework is illustrated in <xref rid="figure4" ref-type="fig">Figure 4</xref>. In ICD-10 Coder and Trainer, with the discharge diagnosis as the data input, the top 20 related ICD-10-CM/PCS codes and the importance of each word related to the corresponding code would be returned to all ICD-10 users for assistance.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>System architecture of the ICD-10 auto coding and training web service. API: application programming interface.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Comparing the Time Consumed and the F<sub>1</sub>-Score With and Without the Auto-Coding System</title>
        <p>We collected 60 discharge notes from February 2021 from the Far Eastern Memorial Hospital (New Taipei City, Taiwan) randomly. Nine coders participated in this experiment. The most experienced coder provided the ground truth. The other 8 coders were divided into 4 groups, and each case assigned to each group could be coded by 2 coders. There are 2 parts in this experiment. In part 1, we only provided medical record numbers, and the coders coded the randomly assigned medical records on a daily basis. Each group was assigned a different set of 10 cases. In part 2, we provided medical record numbers and ICD codes predicted by our best DNN classification model. Each group was randomly assigned 5 cases. We compared the time consumed and the F<sub>1</sub>-score between parts 1 and 2 and performed a paired samples Wilcoxon signed-rank test. A 2-tailed <italic>P</italic>&#60;.05 was considered significant. Furthermore, a questionnaire was designed to collect coders’ opinions on this system.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>ICD-10-CM Whole Label Classification</title>
        <p>In the NTUH data set, the complete ICD-10-CM codes (ie, CM codes with 3-7 characters) corresponding to the discharge diagnosis records comprise 14,602 labels in total. The best DNN classification model based on BERT embedding and FCNN with BiGRU could achieve an F<sub>1</sub>-score of 0.715 and recall@20 of 0.873. <xref ref-type="table" rid="table3">Table 3</xref> shows all comparisons of the whole label classification. Classification results with different BERT pretrained models show no significant effect on performance in both the baseline and BiGRU models.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>F<sub>1</sub>-score and Recall@20 of all embedding models in the International Classification of Diseases-10 Clinical Modification.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="310"/>
            <col width="250"/>
            <col width="220"/>
            <col width="220"/>
            <thead>
              <tr valign="top">
                <td>Embedding model</td>
                <td>Baseline F<sub>1</sub>-score</td>
                <td>F<sub>1</sub>-score</td>
                <td>Recall@20</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Word to Vectors</td>
                <td>0.355</td>
                <td>0.680</td>
                <td>0.873</td>
              </tr>
              <tr valign="top">
                <td>Global Vectors</td>
                <td>0.220</td>
                <td>0.635</td>
                <td>0.836</td>
              </tr>
              <tr valign="top">
                <td>Embeddings from Language Models</td>
                <td>0.633</td>
                <td>0.631</td>
                <td>0.852</td>
              </tr>
              <tr valign="top">
                <td>Bidirectional encoder representations from transformers–based</td>
                <td>0.715</td>
                <td>0.710</td>
                <td>0.869</td>
              </tr>
              <tr valign="top">
                <td>Clinical bidirectional encoder representations from transformers model</td>
                <td>0.712</td>
                <td>0.714</td>
                <td>0.869</td>
              </tr>
              <tr valign="top">
                <td>Bidirectional encoder representations from transformers for biomedical text mining</td>
                <td>0.709</td>
                <td>0.701</td>
                <td>0.863</td>
              </tr>
              <tr valign="top">
                <td>Single Head Attention Recurrent Neural Network</td>
                <td>0.402</td>
                <td>0.570</td>
                <td>0.835</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>ICD-10-PCS Whole Label Classification</title>
        <p>In the ICD-10-PCS whole label classification task, the complete ICD-10-PCS code (ie, PCS codes with 7 characters) corresponding to discharge diagnosis records comprised 9513 labels. Progress and discharge diagnosis were applied for training the DNN model. The results summarized in <xref ref-type="table" rid="table4">Table 4</xref> imply that our best DNN classification model based on BERT embedding and FCNN with BiGRU could achieve an F<sub>1</sub>-score of 0.618 and a recall@20 of 0.887.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>F<sub>1</sub>-score and recall@20 of all embedding models in the International Classification of Diseases-10 Procedure Coding System.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="550"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Embedding model</td>
                <td>Baseline F<sub>1</sub>-score</td>
                <td>F<sub>1</sub>-score</td>
                <td>Recall@20</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Word to Vectors</td>
                <td>0.278</td>
                <td>0.580</td>
                <td>0.850</td>
              </tr>
              <tr valign="top">
                <td>Global Vectors</td>
                <td>0.120</td>
                <td>0.520</td>
                <td>0.841</td>
              </tr>
              <tr valign="top">
                <td>Embeddings from Language Models</td>
                <td>0.547</td>
                <td>0.557</td>
                <td>0.874</td>
              </tr>
              <tr valign="top">
                <td>Bidirectional encoder representations from transformers–based</td>
                <td>0.618</td>
                <td>0.611</td>
                <td>0.880</td>
              </tr>
              <tr valign="top">
                <td>Clinical bidirectional encoder representations from transformers model</td>
                <td>0.596</td>
                <td>0.615</td>
                <td>0.887</td>
              </tr>
              <tr valign="top">
                <td>Bidirectional encoder representations from transformers for biomedical text mining</td>
                <td>0.611</td>
                <td>0.613</td>
                <td>0.880</td>
              </tr>
              <tr valign="top">
                <td>Single Head Attention Recurrent Neural Network</td>
                <td>0.269</td>
                <td>0.527</td>
                <td>0.879</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>ICD-10 Classification With Attention</title>
        <p>By introducing the attention mechanism into the classification model, the relation and importance between word pairs could be computed and visualized. For instance, for 2 sentences, “He had coronary artery disease. Also, he got fever.” and “A heart disease,” weight information for the word “heart” might focus on “coronary” or “artery.” Hence, by extracting the attention weights of the diagnoses and ICD-10 definitions, how coders focus on the words within diagnoses during the ICD-10 coding process could be well understood (<xref rid="figure5" ref-type="fig">Figure 5</xref>). Furthermore, the extracted diagnosis attention weights and the corresponding ICD-10 code could be visualized by highlighting the key words, the weight of which would be higher than a certain threshold, for training a new coder in disease coding. By considering all positive cases and negative sampling up to 40 cases in total, the classification model with the attention mechanism could achieve an F<sub>1</sub>-score of 0.86.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Visualization of attention weights.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>ICD-10 Coding and Training System Framework</title>
        <p>The objective of this study is to build an ICD-10 auto-coding and training system for assisting disease coders to elevate their work efficiency and coding accuracy. An ICD-10 auto-predicting interface with discharge diagnosis as the reference is available on the internet [<xref ref-type="bibr" rid="ref19">19</xref>] for accelerating the coding efficiency. The DNN model executed by the python script would return the top 20 ICD-10-CM and ICD-10-PCS codes with a recall@20 of 0.87 and 0.88, respectively. The predicting process of each case takes less than 30 seconds, which drastically shortens the coding time of 30 minutes per case on average. In addition, training for ICD-10 coding is also provided under the training tab. Given a paragraph of discharge diagnosis, the key words to support the code could be highlighted by clicking on the target code.</p>
        <p>To make the prediction more flexible and adaptable to disease coders in different hospitals, postprocessing rules for dealing with exceptions, such as combination codes and hospital consensus, could be defined under the rule definition panel. Users could apply the default setting or build their own setting to apply the specific coding style. The ICD-10 auto-coding, training, and rule defining panels are shown in <xref rid="figure6" ref-type="fig">Figures 6</xref>, 7, and 8, respectively.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>ICD-10 auto-coding panel.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>ICD-10 auto-training panel.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Postprocessing user defining panel.</p>
          </caption>
          <graphic xlink:href="medinform_v9i8e23230_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Time Consumed and F<sub>1</sub>-Score With and Without the Auto-Coding System</title>
        <p>The ICD-10 auto-coding system with our best DNN classification model significantly improved the coders’ mean F<sub>1</sub>-score from a median of 0.832 to 0.922 (<italic>P</italic>&#60;.05) but did not decrease their mean coding time (<italic>P</italic>=.64), as shown in <xref ref-type="table" rid="table5">Table 5</xref>. The questionnaire revealed that a coder took approximately 20-40 minutes on average to code a case, and 62.5% of coders are willing to use this system in their work. This system might potentially help them not only increase the accuracy of ICD-coding but also save their time.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Time consumed and the F<sub>1</sub>-score with and without the auto-coding system.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="250"/>
            <col width="250"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Coder</td>
                <td>Mean time consumed in part 1<sup>a,b</sup> (minutes:seconds)</td>
                <td>Mean time consumed in part 2<sup>c,d,e</sup> (minutes:seconds)</td>
                <td>Mean F<sub>1</sub>-score in part 1<sup>a,f</sup></td>
                <td>Mean F<sub>1</sub>-score in part 2<sup>c,g,h</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>07:49</td>
                <td>05:11</td>
                <td>0.801</td>
                <td>0.893</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>08:19</td>
                <td>06:01</td>
                <td>0.900</td>
                <td>0.960</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>04:57</td>
                <td>06:16</td>
                <td>0.980</td>
                <td>0.951</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>05:02</td>
                <td>07:32</td>
                <td>0.867</td>
                <td>0.950</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>06:23</td>
                <td>05:18</td>
                <td>0.766</td>
                <td>0.978</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>05:23</td>
                <td>03:53</td>
                <td>0.652</td>
                <td>0.892</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>05:45</td>
                <td>05:25</td>
                <td>0.815</td>
                <td>0.838</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>05:33</td>
                <td>06:43</td>
                <td>0.848</td>
                <td>0.827</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>Without the auto-coding system.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>Median time consumed in part 1=5 minutes 39 seconds (95% CI 5 minutes 1 second to 7 minutes 54 seconds).</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>With the auto-coding system.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>Median time consumed in part 2=5 minutes 43 seconds (95% CI 4 minutes 56 seconds to 6 minutes 52 seconds).</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>Nonsignificant difference in the mean time consumed by coders between parts 1 and 2 of the study (2-tailed <italic>P</italic>=.64 derived from a paired samples Wilcoxon signed-rank test).</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>Median F<sub>1</sub>-score in part 1=0.832 (95% CI 0.744-0.915).</p>
            </fn>
            <fn id="table5fn7">
              <p><sup>g</sup>Median F<sub>1</sub>-score in part 2=0.922 (95% CI 0.836-0.963).</p>
            </fn>
            <fn id="table5fn8">
              <p><sup>h</sup>Significant difference in mean F<sub>1</sub>-scores between parts 1 and 2 (2-tailed <italic>P</italic>&#60;.05 derived from a paired samples Wilcoxon rank sum test).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Compared to a previous study on ICD-9 classification with 85,522 training data and an F<sub>1</sub>-score of 0.41 [<xref ref-type="bibr" rid="ref20">20</xref>], our best DNN classification model based on the BERT embedding method and FCNN with BiGRU achieved an F<sub>1</sub>-score of 0.715 and recall@20 of 0.873. Compared to the baseline model with only 1 fully connected layer, models with BiGRU showed better performance within the embedding approaches using fixed word embedding vectors. However, within embedding methods that are more flexible, such as BERT, the BiGRU classification model shows no significant effect on performance. This indicates that higher-level embedding techniques such as ELMo and BERT are able to sequentially consider contextual semantic information, since they widely introduce the BiGRU and BiLSTM layers or other contextual information extraction methods within their model architectures. On the other hand, among all the embedding methods, BERT showed the best performance; however, it seems that initializing with different BERT pretrained weights has no significant influence on the classification results. However, the simplified BERT model SHA-RNN could only achieve 0.57 on the classification task and could not achieve over 0.41 on the baseline model. This might result from the lack of a corpus for training the embedding model; compared to BERT models, which were trained with millions of articles from Bookcorpus, Wikipedia, etc, we only used our own discharge diagnosis records on SHA-RNN training. This implies the ability of the BERT model to learn and extract the information well in a specific field via only the fine-tuning process; thus, there is no need to train our BERT model from scratch with our own data set, but rather only to initialize with the pretrained weight and fine-tune with our own data set.</p>
        <p>Another previous study compared BERT with other DNNs in ICD-10 auto-coding in nontechnical summaries of animal experiments. They achieved a micro F<sub>1</sub>-score of 73.02% with BioBERT, which is comparable to our results [<xref ref-type="bibr" rid="ref21">21</xref>]. However, nontechnical summaries of animal experiments are not as complicated as the medical records we worked on and BioBERT could perform better than BERT in their data set, but no significant difference was observed in the medical records, as shown herein. Another study found that contextualized deep learning representation models including BERT and ELMo outperform noncontextualized representation models in discovering medical synonyms [<xref ref-type="bibr" rid="ref22">22</xref>], which is consistent with our findings.</p>
        <p>Our system improved the coders’ mean F<sub>1</sub>-score (<italic>P</italic>&#60;.05) but did not decrease the mean coding time (<italic>P</italic>=.64). One of the explanations is that coders had not become familiar with this system yet, and the other explanation is that relatively simple cases were included in this experiment, which led them to take less than the 20-40 minutes per case they typically spend during their daily work, as they indicated in their questionnaire responses. The long-term effect of the ICD-10 auto-coding system should be investigated in the future to determine whether the coding time can be saved.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has some limitations. First, our training data are derived from only 1 medical center. The performance in other hospitals could be affected by different writing habits and different disease prevalence. Second, combination codes remain an intractable issue because in some cases, disease coders cannot and should not assign multiple diagnosis codes in cases where a single combination code clearly identifies all aspects of the patient’s diagnosis. In our results, the combination codes were either predicted incorrectly or separated into 2 different codes. In addition, there are multiple diagnoses that corresponded to multiple codes in order; that is, primary diagnosis, secondary diagnosis, tertiary diagnosis, etc [<xref ref-type="bibr" rid="ref10">10</xref>]. However, the classification model could only give the probability of each code rather than the corresponding order. To resolve the problem while maintaining high performance in the classification task, we proposed a novel approach by combining the Seq2Seq model, which gives the code order. Finally, our system is still new to coders, and few coders have used it. After more users’ responses are collected, further analysis and modification can be performed to improve our system.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, an ICD-10 classification model developed using NLP and a deep learning model without any background knowledge from EHR data yielded an F<sub>1</sub>-score of 0.715 and 0.618 for CM and PCS, respectively. In addition, we built and released the platform for automated ICD-10 prediction and training based on our well-trained models for free to ICD-10 users worldwide and further shortened the coding time from 20-40 minutes to 30 seconds per case. Our platform can be found on the internet [<xref ref-type="bibr" rid="ref19">19</xref>]. Our system can significantly improve coders’ F<sub>1</sub>-score in ICD-10 coding.</p>
        <p>In future studies, we shall attempt to develop and provide other functions such as user feedback and auto-training with new input data to our model. ICD-10 codes in different hospitals with different coding styles will also be constructed in accordance with the amount of user information and prediction history records to improve the automated ICD-10 coding and training system further.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BiGRU</term>
          <def>
            <p>Bidirectional Gated Recurrent Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BioBERT</term>
          <def>
            <p>bidirectional encoder representations from transformers for biomedical text mining</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CM</term>
          <def>
            <p>Clinical Modification</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DNN</term>
          <def>
            <p>Deep Neural Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EHR</term>
          <def>
            <p>Electronic Health Records</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">ELMo</term>
          <def>
            <p>Embeddings from Language Models</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FCNN</term>
          <def>
            <p>fully connected neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">GloVe</term>
          <def>
            <p>Global Vectors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">GRU</term>
          <def>
            <p>Gated Recurrent Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ICD</term>
          <def>
            <p>International Classification of Diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NTUH</term>
          <def>
            <p>National Taiwan University Hospital</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PCS</term>
          <def>
            <p>Procedure Coding System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SHA-RNN</term>
          <def>
            <p>Single Head Attention Recurrent Neural Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">Word2Vec</term>
          <def>
            <p>Word to Vectors</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by grants from the Ministry of Science and Technology, Taiwan (MOST 110-2634-F-002-032-).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>FL and SMW designed the study. SMW and WCL designed and developed the system. PFC, LCK, KCC, YCL, CYY, CHC, and SCC collected the data. PFC and KCC conducted the experiment. SMW, WCL, and PFC conducted statistical analyses and drafted the manuscript. All authors reviewed the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>The International Classification of Diseases, 10th Revision</article-title>
          <source>World Health Organization</source>
          <year>2015</year>
          <access-date>2021-08-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://icd.who.int/browse10/2015/en">https://icd.who.int/browse10/2015/en</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lazakidou</surname>
              <given-names>AA</given-names>
            </name>
          </person-group>
          <source>Handbook of Research on Informatics in Healthcare and Biomedicine</source>
          <year>2006</year>
          <publisher-loc>Hershey, PA</publisher-loc>
          <publisher-name>IGI Global</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farkas</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Szarvas</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Automatic construction of rule-based ICD-9-CM coding systems</article-title>
          <source>BMC Bioinformatics</source>
          <year>2008</year>
          <month>04</month>
          <day>11</day>
          <volume>9 Suppl 3</volume>
          <fpage>S10</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S3-S10"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-9-S3-S10</pub-id>
          <pub-id pub-id-type="medline">18426545</pub-id>
          <pub-id pub-id-type="pii">1471-2105-9-S3-S10</pub-id>
          <pub-id pub-id-type="pmcid">PMC2352868</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>LEAP: Learning to Prescribe Effective and Safe Treatment Combinations for Multimorbidity</article-title>
          <year>2017</year>
          <conf-name>The 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 13-17, 2017</conf-date>
          <conf-loc>Halifax, NS</conf-loc>
          <fpage>1315</fpage>
          <lpage>1324</lpage>
          <pub-id pub-id-type="doi">10.1145/3097983.3098109</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>MedSTS: a resource for clinical semantic textual similarity</article-title>
          <source>Lang Resour Eval</source>
          <year>2018</year>
          <month>10</month>
          <day>24</day>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>57</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-018-9431-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>ICD-10 Auto-coding System Using Deep Learning</article-title>
          <year>2020</year>
          <conf-name>10th International Workshop on Computer Science and Engineering</conf-name>
          <conf-date>June 19-21, 2020</conf-date>
          <conf-loc>Shanghai</conf-loc>
          <fpage>46</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.18178/wcse.2020.02.008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Kuo</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>YN</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>FY</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Using Deep Learning for Automatic ICD-10 Classification from Free-Text Data</article-title>
          <source>Eur J Biomed Inform</source>
          <year>2020</year>
          <volume>16</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ejbi.org/scholarly-articles/using-deep-learning-for-automatic-icd10-classification-from-freetext-data.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.24105/ejbi.2020.16.1.1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Loper</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>NLTK: the Natural Language Toolkit</article-title>
          <year>2002</year>
          <conf-name>ACL-02 Workshop on Effective tools and methodologies for teaching natural language processing and computational linguistics</conf-name>
          <conf-date>July 7, 2002</conf-date>
          <conf-loc>Philadelphia, PA</conf-loc>
          <pub-id pub-id-type="doi">10.3115/1118108.1118117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: Machine Learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <article-title>International Classification of Diseases, Tenth Revision, Clinical Modification</article-title>
          <source>Qeios</source>
          <access-date>2021-08-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.qeios.com/read/SA6DYU">https://www.qeios.com/read/SA6DYU</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global Vectors for Word Representation</article-title>
          <year>2014</year>
          <conf-name>2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 2014</conf-date>
          <conf-loc>Doha</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed Representations of Words and Phrases and their Compositionality</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 16, 2013.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1310.4546"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Grus</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tafjord</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Dasigi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schmitz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>AllenNLP: A Deep Semantic Natural Language Processing Platform</article-title>
          <year>2018</year>
          <conf-name>Workshop for NLP Open Source Software (NLP-OSS)</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w18-2501</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</article-title>
          <year>2019</year>
          <conf-name>17th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2019)</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Altosaar</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ranganath</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ClinicalBERT: Modeling Clinical Notes and Predicting Hospital Readmission</article-title>
          <year>2020</year>
          <conf-name>ACM Conference on Health, Inference, and Learning</conf-name>
          <conf-date>April 2-4, 2020</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.48550/arXiv.1904.05342</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gulcehre</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online December 11, 2014.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1412.3555"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural Machine Translation by Jointly Learning to Align and Translate</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online September 1, 2014.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1409.0473"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <source>ICD Web</source>
          <access-date>2021-08-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://nets.csie.ntu.edu.tw/">https://nets.csie.ntu.edu.tw/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Oren</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Hajaj</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hardt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Marcus</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sundberg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Flores</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Duggan</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Irvine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Litsch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mossin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tansuwan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wexler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ludwig</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Volchenboum</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Madabushi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Howell</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Scalable and accurate deep learning with electronic health records</article-title>
          <source>NPJ Digit Med</source>
          <year>2018</year>
          <volume>1</volume>
          <fpage>18</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-018-0029-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-018-0029-1</pub-id>
          <pub-id pub-id-type="medline">31304302</pub-id>
          <pub-id pub-id-type="pii">29</pub-id>
          <pub-id pub-id-type="pmcid">PMC6550175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dunfield</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Vechkaeva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Wixted</surname>
              <given-names>MK</given-names>
            </name>
          </person-group>
          <article-title>MLT-DFKI at CLEF eHealth 2019: Multi-label classification of ICD-10 codes with BERT</article-title>
          <year>2019</year>
          <conf-name>10th Conference and Labs of the Evaluation Forum</conf-name>
          <conf-date>September 9-12, 2019</conf-date>
          <conf-loc>Lugano</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-2380/paper_67.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schumacher</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Learning unsupervised contextual representations for medical synonym discovery</article-title>
          <source>JAMIA Open</source>
          <year>2019</year>
          <month>12</month>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>538</fpage>
          <lpage>546</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32025651"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooz057</pub-id>
          <pub-id pub-id-type="medline">32025651</pub-id>
          <pub-id pub-id-type="pii">ooz057</pub-id>
          <pub-id pub-id-type="pmcid">PMC6994012</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
