<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v13i1e63020</article-id>
      <article-id pub-id-type="pmid">39761555</article-id>
      <article-id pub-id-type="doi">10.2196/63020</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Autonomous International Classification of Diseases Coding Using Pretrained Language Models and Advanced Prompt Learning Techniques: Evaluation of an Automated Analysis System Using Medical Text</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Castonguay</surname>
            <given-names>Alexandre</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mao</surname>
            <given-names>Siqi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Elbattah</surname>
            <given-names>Mahmoud</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xiang</surname>
            <given-names>Suncheng</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zhuang</surname>
            <given-names>Yan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3483-0988</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Junyan</given-names>
          </name>
          <degrees>ME</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9140-901X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Xiuxing</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1178-7422</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Chao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8960-661X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Yue</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-3697-1405</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Dong</surname>
            <given-names>Wei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4525-1105</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>He</surname>
            <given-names>Kunlun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Medical Big Data Research Center</institution>
            <institution>Chinese PLA General Hospital</institution>
            <addr-line>28 Fuxing Road</addr-line>
            <addr-line>Beijing, 100853</addr-line>
            <country>China</country>
            <phone>86 13911232619</phone>
            <email>kunlunhe@plagh.org</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3335-5700</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Medical Big Data Research Center</institution>
        <institution>Chinese PLA General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Computer Science &#38; Technology</institution>
        <institution>Beijing Institute of Technology</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Digital Health China Technologies Co Ltd</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Senior Department of Cardiology</institution>
        <institution>The Sixth Medical Center of PLA General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Kunlun He <email>kunlunhe@plagh.org</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>6</day>
        <month>1</month>
        <year>2025</year>
      </pub-date>
      <volume>13</volume>
      <elocation-id>e63020</elocation-id>
      <history>
        <date date-type="received">
          <day>11</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>27</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>16</day>
          <month>10</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>11</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Yan Zhuang, Junyan Zhang, Xiuxing Li, Chao Liu, Yue Yu, Wei Dong, Kunlun He. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 06.01.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2025/1/e63020" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Machine learning models can reduce the burden on doctors by converting medical records into International Classification of Diseases (ICD) codes in real time, thereby enhancing the efficiency of diagnosis and treatment. However, this process faces challenges such as small datasets, diverse writing styles, unstructured records, and the need for semimanual preprocessing. Existing approaches, such as naive Bayes, Word2Vec, and convolutional neural networks, have limitations in handling missing values and understanding the context of medical texts, leading to a high error rate. We developed a fully automated pipeline based on the Key–bidirectional encoder representations from transformers (BERT) approach and large-scale medical records for continued pretraining, which effectively converts long free text into standard ICD codes. By adjusting parameter settings, such as mixed templates and soft verbalizers, the model can adapt flexibly to different requirements, enabling task-specific prompt learning.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to propose a prompt learning real-time framework based on pretrained language models that can automatically label long free-text data with ICD-10 codes for cardiovascular diseases without the need for semiautomatic preprocessing.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We integrated 4 components into our framework: a medical pretrained BERT, a keyword filtration BERT in a functional order, a fine-tuning phase, and task-specific prompt learning utilizing mixed templates and soft verbalizers. This framework was validated on a multicenter medical dataset for the automated ICD coding of 13 common cardiovascular diseases (584,969 records). Its performance was compared against robustly optimized BERT pretraining approach, extreme language network, and various BERT-based fine-tuning pipelines. Additionally, we evaluated the framework’s performance under different prompt learning and fine-tuning settings. Furthermore, few-shot learning experiments were conducted to assess the feasibility and efficacy of our framework in scenarios involving small- to mid-sized datasets.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Compared with traditional pretraining and fine-tuning pipelines, our approach achieved a higher micro–F1-score of 0.838 and a macro–area under the receiver operating characteristic curve (macro-AUC) of 0.958, which is 10% higher than other methods. Among different prompt learning setups, the combination of mixed templates and soft verbalizers yielded the best performance. Few-shot experiments showed that performance stabilized and the AUC peaked at 500 shots.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>These findings underscore the effectiveness and superior performance of prompt learning and fine-tuning for subtasks within pretrained language models in medical practice. Our real-time ICD coding pipeline efficiently converts detailed medical free text into standardized labels, offering promising applications in clinical decision-making. It can assist doctors unfamiliar with the ICD coding system in organizing medical record information, thereby accelerating the medical process and enhancing the efficiency of diagnosis and treatment.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>BERT</kwd>
        <kwd>bidirectional encoder representations from transformers</kwd>
        <kwd>pretrained language models</kwd>
        <kwd>prompt learning</kwd>
        <kwd>ICD</kwd>
        <kwd>International Classification of Diseases</kwd>
        <kwd>cardiovascular disease</kwd>
        <kwd>few-shot learning</kwd>
        <kwd>multicenter medical data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>The <italic>International Classification of Diseases, 10th Revision</italic> (<italic>ICD-10</italic>), is a universally recognized diagnostic categorization system widely used in medical insurance reimbursements, health reporting, mortality assessments, and related fields [<xref ref-type="bibr" rid="ref1">1</xref>]. The <italic>ICD-10</italic>’s automatic coding mechanism enables rapid and accurate classification and statistical analysis of medical data, offering a scientific foundation for effective hospital administration and decision-making. In addition, the <italic>ICD-10</italic> automatic coding system accelerates disease diagnosis and treatment planning for medical practitioners, thereby improving medical efficacy and quality. Compared with the original <italic>ICD</italic> code, <italic>ICD-10</italic> provides over 14,000 distinct disease codes (in contrast to the thousands in <italic>ICD-9</italic>), enabling more detailed disease classification. This comprehensive system offers clinicians enhanced patient information, supporting the development of more precise treatment plans and care programs, ultimately improving the quality of care and patient satisfaction. Moreover, as an internationally standardized code, <italic>ICD-10</italic> is essential for global public health surveillance, epidemiological research, and international medical cooperation. Consequently, ensuring accurate <italic>ICD</italic> coding remains a critical priority in clinical practice.</p>
        <p>In hospital settings, the assignment of <italic>ICD</italic> codes to unstructured clinical narratives in medical records is a manual task performed by skilled medical coders based on the attending physician’s clinical diagnosis. Despite its critical importance, this process is often hindered by inefficiencies such as time consumption, susceptibility to errors, and high costs. Additionally, manual coding cannot always ensure the accuracy of <italic>ICD</italic> codes due to the complexity of code assignment, which requires a thorough consideration of the patient’s overall health condition, including medical history, coexisting conditions, complications, surgical interventions, and specialized diagnostic procedures [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      </sec>
      <sec>
        <title>Machine Learning Techniques</title>
        <p>The need to enhance efficiency and reduce errors has driven the development of various machine learning techniques to automate the medical <italic>ICD</italic> coding process. These techniques can be broadly classified into 4 main categories: rule-based systems [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], traditional supervised algorithms [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], gate unit–based deep learning approaches [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], and pretrained language models (PLMs) [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>First, rule-based systems for automatic <italic>ICD</italic> coding rely on the creation of explicit rules and knowledge bases to map medical records to the appropriate <italic>ICD</italic> codes [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Although these approaches have been used for decades and have provided a foundation for more advanced techniques, they are limited by their lack of adaptability and scalability.</p>
        <p>Second, traditional supervised algorithms, such as gradient-boosted trees, have been utilized for <italic>ICD</italic> coding due to their efficiency in handling large-scale, high-dimensional datasets. These algorithms rely on semistructured preprocessing, which involves organizing and refining semistructured data into a format suitable for analysis [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. For example, Diao et al [<xref ref-type="bibr" rid="ref6">6</xref>] developed a light gradient boosting machine–based pipeline for automatically coding 168 primary diagnosis <italic>ICD-10</italic> codes from discharge records and procedure texts, achieving an accuracy of 95.2%. Another study integrated long short-term memory networks with attention mechanisms to predict mortality in ICU patients using electronic health records, achieving significantly higher area under the receiver operating characteristic curve (AUC) scores compared with traditional statistical models and stand-alone long short-term memory networks [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>Third, PLMs are neural network models with fixed architectures trained on large corpora, which can be fine-tuned for specific downstream tasks such as question answering and entity recognition [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. A notable example is bidirectional encoder representations from transformers (BERT), a prominent PLM designed to learn deep bidirectional representations from large-scale unlabeled text data. BERT effectively captures semantic relationships in clinical records and can be easily adapted to various natural language processing (NLP) tasks through task-specific layers [<xref ref-type="bibr" rid="ref13">13</xref>]. Coutinho and Martins [<xref ref-type="bibr" rid="ref14">14</xref>] proposed a BERT model with a fine-tuning method for automatic <italic>ICD-10</italic> coding of death certificates based on free-text descriptions and associated documents. Additionally, Yan et al [<xref ref-type="bibr" rid="ref15">15</xref>] introduced RadBERT, an ensemble model combining BERT-base, Clinical-BERT, the robustly optimized BERT pretraining approach (RoBERTa), and BioMed-RoBERTa adapted for radiology. Liu et al [<xref ref-type="bibr" rid="ref16">16</xref>] evaluated RadBERT across 3 NLP tasks: abnormal sentence classification, report coding, and report summarization, demonstrating significantly better performance compared with existing transformer language models. Unstructured patient-generated health data can be leveraged to support clinical decision-making, remote monitoring, and self-care, including medication adherence and chronic disease management. 
By applying named entity recognition and customizable information extraction methods based on medical ontologies, NLP models can extract a wide range of clinical information from unstructured patient-generated health data, even in low-resource settings with limited patient notes or training data [<xref ref-type="bibr" rid="ref17">17</xref>]. Textual analysis presents numerous opportunities for future medical applications. It can aid in extracting information from various sources of medical data, such as clinical reports, nursing notes, scientific literature, and user-generated content. Additionally, vector-based representation methods can transform textual data within clinical documents into formats suitable for machine learning and can be applied to sequence modeling tasks, including sentiment analysis [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>Finally, XLNet is another type of PLM that captures both forward and backward contexts of text [<xref ref-type="bibr" rid="ref19">19</xref>]. It combines the advantages of autoregressive models and autoencoding models while overcoming their limitations. XLNet utilizes a permutation-based objective function that maximizes the expected likelihood of a text across all possible word orderings. It also incorporates the Transformer-XL (Transformer-Extra-Long) architecture, which enables long-term dependency modeling and improved memory efficiency. XLNet has been shown to outperform BERT and other baseline models on several natural language understanding tasks.</p>
      </sec>
      <sec>
        <title>Prompt Engineering Techniques</title>
        <p>By contrast, prompt engineering is a technique that involves the careful construction of prompts or inputs for artificial intelligence models to improve their performance on specific tasks. This technique includes selecting appropriate words, phrases, symbols, and formats to guide a large language model in generating high-quality and relevant text. Numerous studies have used prompts for model tuning to bridge the gap between pretraining objectives and downstream tasks, demonstrating that both discrete and continuous prompts can improve performance in few-shot and zero-shot tasks [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Furthermore, this technique within PLMs has been shown to outperform fine-tuning in various clinical decision-making tasks [<xref ref-type="bibr" rid="ref22">22</xref>]. It has the advantage of requiring less data and computational resources, making it especially suitable for clinical settings.</p>
        <p>There are 2 primary categories of prompting methods: hard prompts and soft prompts [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. Hard prompts involve using an actual text string as the prompt and include methods that automatically search for templates within a discrete space, such as mining-based, paraphrasing-based, and gradient-based approaches [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. The advantages of hard prompts are interpretability, portability, flexibility, and simplicity. However, designing effective prompts for specific tasks requires significant effort and creativity.</p>
        <p>Soft prompts, by contrast, are learnable tensors concatenated with the input embeddings and can be optimized for a given dataset. The main advantage of soft prompts is their ability to achieve better performance than hard prompts by adapting to the model and the data. However, they are not human-readable and lack portability across different models.</p>
        <p>Prefix tuning and P-tuning are 2 methods of prompt engineering that can enhance performance beyond traditional fine-tuning [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. Prefix tuning is a lightweight approach that keeps the PLM parameters unchanged while optimizing a sequence of task-specific vectors called the prefix [<xref ref-type="bibr" rid="ref23">23</xref>]. This prefix is added to the input and interacts with the model’s hidden states at each layer. Its success depends on how effectively the prefix is initialized, particularly when data are limited. P-tuning is another prompt tuning strategy that performs comparably to fine-tuning across various tasks [<xref ref-type="bibr" rid="ref24">24</xref>]. It reduces the number of PLM parameters through self-adaptive pruning and tunes a small number of continuous prompts at the beginning of each transformer layer.</p>
        <p>The verbalizer is the final layer that defines the answer space and maps it to the target output. Typically, verbalizers are manually created, which can limit their coverage due to personal vocabulary biases [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. To address this, some studies have proposed automatic verbalizer search methods to identify more effective verbalizers, also known as soft verbalizers [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      </sec>
      <sec>
        <title>Autonomous ICD Coding in Cardiovascular Disease</title>
        <p>Cardiovascular disease (CVD) is currently a leading cause of death worldwide, posing a significant risk of mortality among patients [<xref ref-type="bibr" rid="ref7">7</xref>]. Automatically labeling patients with CVD is essential for clinical decision-making and resource allocation. However, existing prediction models have limitations, including low accuracy, limited generalizability, and an inability to capture multicenter data. To address these challenges, we propose a prompt learning real-time framework based on PLMs that can automatically label long free-text data with <italic>ICD-10</italic> codes for CVDs without the need for semiautomatic preprocessing.</p>
        <p>Our framework consists of 4 components: a medically oriented pretrained BERT, a keyword filtration BERT, a fine-tuning phase, and task-specific prompt learning facilitated by mixed templates and soft verbalizers. To validate the efficacy of our framework, we conducted comprehensive evaluations on a Chinese multicenter cardiovascular dataset that includes data from 13,000 patients with CVD. This deliberate choice of dataset ensures the robustness and wide applicability of our framework. We compared our framework with RoBERTa, XLNet, and various BERT-based fine-tuning pipelines to highlight its performance. Additionally, we conducted few-shot experiments to demonstrate its resilience. This work promises to provide valuable insights into enhancing medical knowledge extraction and its effective application, underscoring the need for continued research and development in this promising area. In future work, we plan to implement this fully automated <italic>ICD</italic> coding pipeline across various clinical applications, including clinical decision support systems, cohort studies, and disease early warning and diagnosis systems.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study was approved by the Ethics Committee of the Chinese PLA General Hospital (S2023-325-02). Ethical approval included a waiver for obtaining informed consent signatures from participants. The study posed no potential harm to participants and did not involve any compensation for their participation. To protect patient privacy, we used regular expressions to parse and redact basic identifying information from the medical records. As these records were created using a standardized template, we ensured that the excerpts extracted for this study did not contain patients’ names.</p>
      </sec>
      <sec>
        <title>Overview</title>
        <p>The overall framework of the model is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. We used a corpus dataset of 575,632 clinical notes to continue training the original BERT model, which we named medical domain refinement-BERT (MDR-BERT), as the PLM for our work. For the classification task, we first applied Key-BERT to filter the discharge summaries. This method extracts keywords and splits long free-text data into shorter sentences.</p>
        <p>We then constructed the input template for fine-tuning and prompt learning using 3 components: the soft prompt, the manual prompt, and the mask part. The manual prompt was a handcrafted text prompt containing discrete tokens. The soft prompt was a learnable pseudo-token with a few continuous parameters. The mask part represented the <italic>ICD</italic> coding label. Finally, we used a trainable soft verbalizer to compute and apply the softmax function to the probabilities of the <italic>ICD</italic> classes, producing the output. By designing specific prompts, it is possible to incorporate the knowledge of medical experts into the model, helping it better understand and perform <italic>ICD</italic> coding. These prompts can direct the model to focus on critical sections of the input text, thereby enhancing performance.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overall framework of MDR-BERT, Key-BERT, and prompt learning pipeline. BERT: bidirectional encoder representations from transformers; ICD: International Classification of Diseases; MDR: medical domain refinement.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Dataset Characteristics</title>
        <p>The cardiovascular dataset used in this study was obtained from the Cardiovascular Department of the Chinese PLA General Hospital’s Medical Big Data Research Center in Beijing, China, which includes 9 medical centers with data aggregated into a comprehensive medical big data platform. Additionally, the hospital is a key center for the treatment of CVDs, with numerous specialized physicians and detailed medical records, making its data highly practical and representative. To ensure privacy, patient names and addresses were desensitized. The data platform consists of electronic health records aggregated from 8 affiliated medical centers. A total of 584,969 clinical notes with structured <italic>ICD</italic> labels were extracted from admission records and discharge summaries in the Cardiovascular Department. We ensured that each diagnosis included at least 50 cases and adopted a stratified sampling approach to divide each disease category into training, validation, and test sets in a 3:1:1 ratio. The detailed distribution and basic statistical information of the dataset are shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Distribution and basic statistical information of the dataset. ICD: International Classification of Diseases.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Based on the long-tailed distribution and clinician selection, 13 diseases were chosen for classification. These diseases include atrial fibrillation, acute myocardial infarction, infective endocarditis, acute left heart failure, acute coronary syndrome (ACS), acute aortic dissection, hypertensive emergency, acute pulmonary embolism, acute myocarditis, ventricular tachycardia, cardiogenic shock, acute heart failure, and third-degree atrioventricular block. The corresponding <italic>ICD-10</italic> codes and abbreviations for these diseases are listed in <xref ref-type="table" rid="table1">Table 1</xref>. Despite the disparity in the number of cases for different diseases, the imbalance inherent in medical data accurately reflects real-world conditions, taking into account the clinical insights of medical professionals. This imbalance represents the varying frequency at which different diseases occur in clinical practice. By preserving the raw data distribution and avoiding artificial balancing, our training approach aligns more closely with real-world medical practice. As a result, this enhances the model’s generalization ability and its applicability in practical scenarios.</p>
        <p>To ensure task independence and prevent data leakage, all clinical notes were divided into 2 parts: the pretraining corpus dataset and the <italic>ICD</italic> coding dataset. The pretraining corpus consisted of a total of 575,632 notes, while the <italic>ICD</italic> coding dataset included 9337 discharge records. The data were stratified by imbalanced <italic>ICD</italic> labels and randomly split into training, validation, and test sets in a 3:1:1 ratio. The sample sizes were as follows: 5734 in the training set, 1913 in the validation set, and 2007 in the test set. We applied regularization to truncate patients’ basic information, as this information could negatively impact the model’s fitting.</p>
        <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, the distribution of the 13 <italic>ICD</italic> codes was imbalanced and exhibited a long-tail pattern. The dataset for <italic>ICD</italic> classification contains a total of 4.574 × 10<sup>7</sup> words, with an average of 490 words per note. The maximum and minimum lengths of the clinical notes are 5243 and 22 words, respectively.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Overview of target International Classification of Diseases (ICD) codes and disease names.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td><italic>International Classification of Diseases</italic> code</td>
                <td>Disease (abbreviation)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>I48.0</td>
                <td>Atrial fibrillation (AF)</td>
              </tr>
              <tr valign="top">
                <td>I21.9</td>
                <td>Acute myocardial infarction (AMI)</td>
              </tr>
              <tr valign="top">
                <td>I33.0</td>
                <td>Infective endocarditis (IE)</td>
              </tr>
              <tr valign="top">
                <td>I50.1</td>
                <td>Acute left heart failure (ALHF)</td>
              </tr>
              <tr valign="top">
                <td>I20.9</td>
                <td>Acute coronary syndrome (ACS)</td>
              </tr>
              <tr valign="top">
                <td>I71.0</td>
                <td>Acute aortic dissection (AAD)</td>
              </tr>
              <tr valign="top">
                <td>I10.1</td>
                <td>Hypertensive emergency (HE)</td>
              </tr>
              <tr valign="top">
                <td>I26.0</td>
                <td>Acute pulmonary embolism (APE)</td>
              </tr>
              <tr valign="top">
                <td>I51.4</td>
                <td>Acute myocarditis (AM)</td>
              </tr>
              <tr valign="top">
                <td>I47.2</td>
                <td>Ventricular tachycardia (VT)</td>
              </tr>
              <tr valign="top">
                <td>R57.0</td>
                <td>Cardiogenic shock (CS)</td>
              </tr>
              <tr valign="top">
                <td>I50.2</td>
                <td>Acute heart failure (AHF)</td>
              </tr>
              <tr valign="top">
                <td>I44.2</td>
                <td>Third-degree atrioventricular block (TAB)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Distribution of ICD codes for the triage task. ICD: International Classification of Diseases.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Pretraining</title>
        <p>Our study’s foundational framework is based on BERT, a multilayer bidirectional transformer encoder known for its conceptual simplicity and empirical effectiveness [<xref ref-type="bibr" rid="ref33">33</xref>]. This architecture consists of 12 layers, a hidden size dimension of 768, and 12 self-attention heads [<xref ref-type="bibr" rid="ref13">13</xref>]. BERT’s inherent self-attention mechanism provides the versatility to handle various downstream tasks by allowing the interchange of relevant inputs and outputs, making it well-suited for our task involving <italic>ICD</italic> classification through clinical records.</p>
        <p>To adapt BERT to the specific requirements of our task, we continued training the PLM using an extensive medical corpus, resulting in MDR-BERT. During the tuning process, we selected a batch size of 32, considering the constraint of a maximum sequence length of 512 tokens. The Adam optimization algorithm was used with a conservative learning rate of 2 × 10<sup>–5</sup>. The training was carried out over 15 epochs, an empirically determined figure based on the characteristics of the clinical dataset.</p>
      </sec>
      <sec>
        <title>Key-BERT</title>
        <p>The Key-BERT method offers a novel self-supervised framework for extracting keywords and keyphrases from textual content using deep learning techniques [<xref ref-type="bibr" rid="ref34">34</xref>]. This approach leverages the contextual and semantic features provided by bidirectional transformers, with a particular focus on the influential BERT model. The method’s architecture is designed for end-to-end training, utilizing a contextually self-annotated corpus that enables the model to develop a nuanced understanding of the complex relationships between words and their semantic meanings. In the <italic>ICD</italic> coding task, Key-BERT leverages BERT’s context-aware capabilities to extract keywords from the document, quickly identify the sections relevant to <italic>ICD</italic> coding, and reduce the risk of miscoding caused by misinterpreting or overlooking critical information in the text.</p>
        <p>A distinctive feature of Key-BERT lies in its automated keyword labeling process. This process effectively utilizes contextual insights from bidirectional transformers to construct a carefully curated ground truth dataset. This approach bypasses the labor-intensive task of manual labeling and eliminates the need for domain-specific expertise.</p>
        <p>The repository of self-labeled data generated by Key-BERT is partially shared with the NLP community, contributing to a deeper and more comprehensive understanding of keyword extraction techniques across various domains. This collaborative effort enhances the landscape of knowledge and expertise, driving advancements in the field of NLP and semantic information extraction.</p>
        <p>To extract keywords using Key-BERT, the contextual feature vector for each word in a sentence is obtained by passing the sentence through the pretrained BERT model. Let <italic>S</italic> = [<italic>w<sub>1</sub></italic>, <italic>w<sub>2</sub></italic>, ..., <italic>w<sub>n</sub></italic>] be a sentence consisting of <italic>n</italic> words, where <italic>w<sub>i</sub></italic> is the <italic>i</italic>th word in the sentence and <italic>E<sub>i</sub></italic> is the contextual feature vector of the <italic>i</italic>th word in the sentence. The sentence embedding vector, denoted as <italic>E<sub>s</sub></italic>, is obtained by averaging the feature vectors of all the words in the sentence:</p>
        <disp-formula>E<sub>i</sub> = BERT_Embedding(w<sub>i</sub>) <bold>(1)</bold></disp-formula>
        <disp-formula>E<sub>s</sub> = (E<sub>1</sub> + E<sub>2</sub> + ··· + E<sub>n</sub>)/(n) <bold>(2)</bold></disp-formula>
        <p>The cosine similarity metric is used to measure the similarity between the sentence embedding vector and the feature vectors of candidate keywords or keyphrases.</p>
        <disp-formula>Cos_SIM(E<sub>i</sub>, E<sub>s</sub>) = (E<sub>i</sub> × E<sub>s</sub>)/(&#124;&#124;E<sub>i</sub>&#124;&#124; × &#124;&#124;E<sub>s</sub>&#124;&#124;) <bold>(3)</bold></disp-formula>
        <p>The top-scoring keywords or keyphrases are returned as the most relevant to the document. Additionally, key medical terms are directly extracted using the medical diagnostic table, ensuring that essential terminology is accurately identified and applied.</p>
      </sec>
      <sec>
        <title>Fine-Tuning and Prompt Learning</title>
        <p>To fully leverage the clinical knowledge embedded within the dataset, our fine-tuning approach mirrors the unsupervised task used in the initial pretraining phase, known as masked language modeling (MLM). MLM involves randomly masking a predetermined proportion of input tokens, and the model then attempts to predict these masked tokens based on context. This process, commonly called a Cloze task, helps the model learn contextual relationships effectively.</p>
        <p>For the fine-tuning phase in this study, we maintained the MLM framework to align with the pretraining procedure. A consistent masking rate of 15% was applied across the dataset. In addition to the fine-tuning process, we introduced prompt learning during parameter tuning. This approach involved the construction of a template comprising 4 distinct components: the input text, a soft prompt, a manual prompt, and a masking component. The manual prompt included discrete tokens that reflected the downstream task expected by the PLM. By contrast, the soft prompt comprised trainable continuous vectors, which enhanced the model’s adaptability.</p>
        <p>Formally, automatic <italic>ICD</italic> coding, as a text multiclassification task, can be denoted as (<italic>x</italic>, <italic>y</italic>), where <italic>x</italic> is the set of discharge summaries and <italic>y</italic> is the <italic>ICD</italic> code set of the 13 chosen discharge diagnoses as labels. Given a clinical record <italic>x</italic> ∈ <italic>X</italic>, it can be annotated with <italic>ICD</italic> codes of discharge diagnosis <italic>y<sup>x</sup></italic> ∈ <italic>Y</italic> and a sequence of discrete input tokens <italic>x</italic> = (<italic>x</italic><sub>0</sub>, <italic>x</italic><sub>1</sub>, ..., <italic>x<sub>k</sub></italic>), where <italic>k</italic> is the number of tokens in the sequence. Prompt learning can be achieved via modifying the <italic>x</italic> to a prompt format <italic>x</italic>′ = f<sub>p</sub>(<italic>x</italic>), where the template f<sub>p</sub>(·) will insert a number of extra embeddings to <italic>x</italic> along with a masked token, denoted by &#60;[MASK]&#62;. Compared with hard prompts, soft prompts replace some fixed manual components with trainable embeddings (continuous vectors) of the same dimension as the PLM. After that, <italic>x</italic>′ is fed into <italic>M</italic>, to predict the masked token, which is in accordance with the objective of <italic>M</italic>. The output of <italic>M</italic> will be a distribution over the fixed vocabulary <italic>V</italic> of <italic>M</italic>. The next crucial step is to map tokens in <italic>V</italic> to <italic>y</italic> for the downstream task with a mapping <inline-graphic xlink:href="medinform_v13i1e63020_fig9.png" xlink:type="simple" mimetype="image"/>, known as verbalization. In summary, there are 2 essential components to be studied: the template of prompt <italic>x</italic>′ = f<sub>p</sub>(x) and the mapping of verbalizer <inline-graphic xlink:href="medinform_v13i1e63020_fig9.png" xlink:type="simple" mimetype="image"/>.</p>
        <p>A mixed template of prompts in this paper is used. For simplicity, the prompt function <italic>x</italic>′ = f<sub>p</sub>(x) is denoted as a sequence template:</p>
        <disp-formula>x′ = [P<sub>0</sub>, P<sub>1</sub>, …, P<sub>j</sub>], x, [P<sub>j+1</sub>, P<sub>j+2</sub>, …, P<sub>t</sub>], [MASK] <bold>(4)</bold></disp-formula>
        <p>where <italic>P<sub>i</sub></italic> refers to the <italic>i</italic>th token in the template and <italic>t</italic> is the number of prompt tokens beyond <italic>x</italic>. <italic>P<sub>i</sub></italic> does not necessarily satisfy <italic>P<sub>i</sub></italic> ∈ <italic>V</italic>, except in the case of a manual hard prompt. As <italic>x</italic>′ is fed to the PLM, the prompt tokens are also mapped to the embedding space, where we can assume that the tokens denoted as &#60;[soft]&#62; in the template can be tuned during training as pseudo-tokens. A simple example of a prompt template for automatic <italic>ICD</italic> coding could be generated as follows:</p>
        <disp-formula>x′ = &#60;x&#62;&#60;[soft]&#62;be encoded as &#60;[MASK]&#62; <bold>(5)</bold></disp-formula>
        <p>Once these templates were formulated, the model inputs, along with the established templates, were processed through the trainable MDR-BERT model. Notably, in the final layer of the most advanced pipeline, a soft verbalizer mode was used. This mode manages the mapping process between the predicted tokens and the final <italic>ICD</italic> codes. The innovative feature of the soft verbalizer is its substitution of tokens in the verbalizer with trainable vectors, each tailored to a specific class. Generally, the verbalizer maps the probabilities of infrequent words in the vocabulary to the probabilities of the labels. The set of label words is denoted as <italic>V</italic>, the label space is <italic>Y</italic>, and <italic>V<sub>y</sub></italic> represents the subset of label words for label <italic>y</italic>. The final estimation of the probability for label <italic>y</italic> is calculated using equation 6, where <italic>g</italic> is utilized to convert the probability of label words to the probability of the label:</p>
        <p><italic>P</italic>(<italic>y</italic>&#124;<italic>x</italic><sub>_</sub><italic><sub>P</sub></italic>) = <italic>g</italic>(<italic>P<sub>M</sub></italic>([MASK] = <italic>v</italic>&#124;<italic>X</italic><sub>_</sub><italic><sub>P</sub></italic>)&#124;<italic>v</italic> ∈ <italic>V<sub>y</sub></italic>) <bold>(6)</bold></p>
        <p>This strategy enhances the precision and semantic accuracy of the generated outputs, enabling a more precise alignment between predicted tokens and the definitive <italic>ICD</italic> codes. Consequently, it is unnecessary to manually build an explicit mapping <inline-graphic xlink:href="medinform_v13i1e63020_fig9.png" xlink:type="simple" mimetype="image"/>. for the soft verbalizer, as the trainable vectors do not have explainable semantic meaning. A matrix operator can represent the soft verbalizer as <inline-graphic xlink:href="medinform_v13i1e63020_fig10.png" xlink:type="simple" mimetype="image"/> [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>], where <italic>n</italic> represents the size of <italic>y</italic> and <italic>m</italic> represents the dimension of output embeddings from <italic>M</italic>. For the verbalizer, <italic>θ<sub>i</sub></italic> denotes the <italic>i</italic>th row of <inline-graphic xlink:href="medinform_v13i1e63020_fig11.png" xlink:type="simple" mimetype="image"/> as the trainable vector of the <italic>i</italic>th class. The soft verbalizer replaces the original decoder head of <italic>M</italic> by mapping the embeddings of <italic>x′</italic> from <italic>M</italic>, denoted as <italic>e</italic>(<italic>x′</italic>), to the distribution over the classes of <italic>y</italic>. We denote the resulting mapping from <inline-graphic xlink:href="medinform_v13i1e63020_fig12.png" xlink:type="simple" mimetype="image"/> to the prediction of the embedding of &#60;[MASK]&#62; as <inline-graphic xlink:href="medinform_v13i1e63020_fig13.png" xlink:type="simple" mimetype="image"/>, where <italic>l</italic> is the sequence length of <italic>x′</italic>. And then, the probability of class <italic>y</italic> can be calculated as follows:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>The loss from the automatic <italic>ICD</italic> coding task can be backpropagated to tune only the embeddings for the prompt template and the verbalizer. The loss function can be expressed as follows:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Ultimately, the model learns to generate and map the most appropriate <italic>ICD</italic> codes to the corresponding discharge record.</p>
        <p>The experiments were conducted using the OpenPrompt framework [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. For prompt learning, we utilized the Adafactor optimizer for soft and mixed prompt templates, while the AdamW optimizer was used for the PLMs and soft verbalizers. In conventional fine-tuning, we applied the AdamW optimizer to the MLP heads and PLMs. To expedite the experiments, we used 2 Nvidia TESLA V100 GPUs, each with 16-GB memory, and set the batch size to 32 due to memory constraints.</p>
        <p>The model’s performance is influenced by variations in hyperparameters. In the comparisons presented, hyperparameters were carefully optimized for each model. To determine the optimal configuration, we used a random search strategy. This approach involves generating multiple random combinations of parameters, evaluating the performance of each combination, and selecting the one that yields the best results. Accuracy and AUC were chosen as the primary optimization objectives during the random search, as they intuitively reflect the model’s classification performance. The strategy involved 100 training runs, each using randomly generated hyperparameters from the defined search space. To effectively address model overfitting, we carefully adjusted the dropout rate within a range of 0.1-0.5. After numerous training iterations, we found that the optimal dropout rate for the prompt learning phase is 0.382, while for the prompt tuning phase, it is 0.1563. In the prompt learning phase, a higher dropout rate contributes to improved generalization, serving as an effective safeguard against overfitting. In the subsequent fine-tuning phase, a lower dropout rate is used to ensure the model retains its learned attributes while enabling further performance enhancement. The optimal hyperparameters for the models are detailed in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The optimal hyperparameters and their search space.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="300"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Hyperparameters</td>
                <td>Search space</td>
                <td colspan="2">Optimal hyperparameter</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Prompt learning</td>
                <td>Fine-tuning</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Learning rate</td>
                <td>log.uniform [1 × 10<sup>–5</sup>, 3 × 10<sup>–1</sup>]</td>
                <td>0.0048</td>
                <td>0.0121</td>
              </tr>
              <tr valign="top">
                <td>Batch size</td>
                <td>4</td>
                <td>4</td>
                <td>4</td>
              </tr>
              <tr valign="top">
                <td>Gradient accumulation steps</td>
                <td>range[2,10]</td>
                <td>4</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>Dropout</td>
                <td>range[0.1,0.5]</td>
                <td>0.382</td>
                <td>0.1563</td>
              </tr>
              <tr valign="top">
                <td>Optimizer</td>
                <td>[adamw, adafactor]</td>
                <td>adamw</td>
                <td>adafactor</td>
              </tr>
              <tr valign="top">
                <td>Prompt learning rate</td>
                <td>log.uniform [1 × 10<sup>–5</sup>, 3 × 10<sup>–1</sup>]</td>
                <td>0.3</td>
                <td>—<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>Verbalizer learning rate</td>
                <td>log.uniform [1 × 10<sup>–5</sup>, 1 × 10<sup>–1</sup>]</td>
                <td>0.007</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not available.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>To thoroughly evaluate and compare the performance of the models, we used a range of metrics, including micro–<italic>F</italic><sub>1</sub>-score, macro-AUC, and accuracy. The definitions for micro-averaged precision and micro–<italic>F</italic><sub>1</sub>-score are provided in equations 9-11, while the macro-AUC is defined in equations 12 and 13.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Micro–<italic>F</italic><sub>1</sub>-score = [2 × (micro-<italic>P</italic>) × (micro-<italic>R</italic>)]/[(micro-<italic>P</italic>) + (micro-<italic>R</italic>)] <bold>(11)</bold></p>
        <p>where TP<italic><sub>i</sub></italic>, FP<italic><sub>i</sub></italic>, and FN<italic><sub>i</sub></italic> represent true positives (correctly assigned instances), false positives (incorrect assignments by automated methods), and false negatives (correct instances omitted by automated methods), respectively, of code <italic>i</italic>, and <italic>l</italic> is the size of the sample space. The micro–<italic>F</italic><sub>1</sub>-score is the harmonic mean of micro-<italic>P</italic> and micro-<italic>R</italic>, and a bigger value of micro–<italic>F</italic><sub>1</sub>-score indicates a better performance.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e63020_fig19.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where <italic>n</italic> is the number of thresholds and <italic>K</italic> is the number of classes.</p>
      </sec>
      <sec>
        <title>Data and Code Availability</title>
        <p>Data acquisition requests can be made by contacting the corresponding author (KH). Given the sensitive nature of the hospital data, it cannot be released publicly. However, part of the downstream subtask data is currently undergoing desensitization and approval processes. The source code for this study is publicly available on GitHub [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance of Different Pipelines</title>
        <p>To evaluate the performance of different methods, we implemented 4 state-of-the-art algorithms: BERT [<xref ref-type="bibr" rid="ref15">15</xref>], XLNet [<xref ref-type="bibr" rid="ref18">18</xref>], RoBERTa [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], and prompt learning [<xref ref-type="bibr" rid="ref22">22</xref>]. These PLMs were integrated with various algorithms to create 6 main pipelines: BERT with fine-tuning, XLNet with fine-tuning, RoBERTa with fine-tuning, BERT with prompt learning, MDR-BERT with prompt learning, and MDR-BERT with both fine-tuning and prompt learning. MDR-BERT is a PLM developed by further pretraining BERT on our medical corpus.</p>
        <p>As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, MDR-BERT with fine-tuning and prompt learning achieved the highest performance across all evaluation metrics, with a micro–<italic>F</italic><sub>1</sub>-score of 0.838, a macro-AUC of 0.958, and an accuracy of 0.838. MDR-BERT with prompt learning alone performed slightly worse than the combined fine-tuning and prompt learning approach, but both outperformed the other pipelines by a significant margin. This suggests that continued pretraining on clinical records can significantly enhance the performance of the PLM for the task, while freezing parameters may hinder the adaptation of smaller PLMs to the task.</p>
        <p>Among the other pipelines, BERT with prompt learning achieved the highest accuracy (0.67) and the highest micro–<italic>F</italic><sub>1</sub>-score (0.64), though its macro-AUC (0.79) was slightly lower than that of RoBERTa with fine-tuning. This suggests that prompt learning, as a lightweight tuning approach, can match or even surpass traditional fine-tuning methods, aligning with the findings of Taylor et al [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>We also conducted a comparison with state-of-the-art methods and selected 2 prominent models: mt5-xxl (11B) and Qwen2.5-72B-Instruct. Among these, mt5-xxl demonstrated the best performance in text classification, while Qwen2.5-72B-Instruct excelled as a large language model. For mt5-xxl, we fine-tuned the model using the training and validation sets from our fine-tuning dataset, setting the “prefix_text” to “Classify the following text:”. For Qwen2.5-72B-Instruct, we conducted experiments using both zero-shot and retrieval augmented generation methods. In the zero-shot setting, we used prompts to constrain the diagnostic scope, allowing the model to make inferences based on the input information. For the retrieval augmented generation approach, we first encoded the training set using BGE-M3 (BAAI general embedding multilinguality, multigranularity, and multifunctionality) and stored it in a Faiss vector database. During the testing phase, we retrieved cases and classification results relevant to the input content and concatenated them with the prompt to enhance model performance.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Performance comparison of the different pipelines across evaluation metrics. AUC: area under the receiver operating characteristic curve; BERT: bidirectional encoder representations from transformers; MDR: medical domain refinement; RoBERTa: robustly optimized BERT pretraining approach; XLNet: extreme language network.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The experimental results indicate that the micro–<italic>F</italic><sub>1</sub>-score for the mt5-xxl method is 0.846, and the AUC value is 0.945. In comparison, the micro–<italic>F</italic><sub>1</sub>-score for the Qwen2.5-72B-Instruct method was 0.822, and the AUC value was 0.848. However, the accuracy of both methods does not surpass that of our MDR-BERT model (<xref rid="figure5" ref-type="fig">Figure 5</xref>). After a series of strategic optimizations, our MDR-BERT model achieved results comparable to the fine-tuned mt5-xxl on specific tasks. This is primarily due to the specific structure of the medical records, which can be effectively captured by models with fewer parameters, meaning that overly complex models are not necessary to achieve good performance.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Micro-<italic>F</italic><sub>1</sub>-score and AUC values for the MDR-BERT model versus the QWEN2.5 and mt5-xxl models. AUC: area under the receiver operating characteristic curve; MDR: medical domain refinement; BERT: bidirectional encoder representations from transformers.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Performance of Different Prompt Learning Modes</title>
        <p>We evaluated the performance of MDR-BERT under various settings of prompt learning and fine-tuning, using 3 types of templates (manual, soft, and mixed) and 2 types of verbalizers (manual and soft) as hyperparameters.</p>
        <p>For templates, both scripted and self-adaptive patterns performed well independently, and their combination had a cumulative positive effect on performance. For verbalizers, the self-adaptive type outperformed the traditional manual vectors and had a greater impact on overall performance. As shown in <xref rid="figure6" ref-type="fig">Figure 6</xref>, the combination of mixed templates and the soft verbalizer achieved the best results.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Comparison among different prompt combinations in verbalizer and template. AUC: area under the receiver operating characteristic curve.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Take the following prompt template as an example:</p>
        <p>Mixed template: {“placeholder”: “text_a”} patient {“soft”:“ can be diagnosed as ”} {“mask”}.</p>
        <p>For the following case:</p>
        <disp-quote>
          <p>The patient was discovered to have bradycardia and unconscious disturbance 7 days ago as a result of physical examination. After consultation with the director, lipid-lowering drugs were added. No diarrhea was detected, and no medication was administered at home. Permanent cardiac pacemaker implantation under local anesthesia was carried out, and after the surgery, cephalosporin for injection was utilized to prevent infection.</p>
        </disp-quote>
        <p>The classification result by our model is as follows: “The patient can be diagnosed as {third-degree atrioventricular block}.”</p>
        <p>For the mixed template, the patient’s bradycardia requires management through the implantation of a permanent pacemaker, indicating that bradycardia is a major medical concern. By applying soft verbalizers, we can guide the correct diagnosis by emphasizing both the reason for the pacemaker implantation and the underlying cause of bradycardia: “The patient can be diagnosed with third-degree atrioventricular block.”</p>
      </sec>
      <sec>
        <title>Performance of MDR-BERT With Fine-Tuning and Prompt Learning</title>
        <p>We evaluated the performance of the MDR-BERT pipeline, incorporating both fine-tuning and prompt learning, for each ICD code using precision, recall, and micro–<italic>F</italic><sub>1</sub>-score. <xref rid="figure7" ref-type="fig">Figure 7</xref> presents the results for these metrics across the 13 ICD classes.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Precision, recall, and micro-<italic>F</italic><sub>1</sub> scores of every ICD code in the MDR-BERT pipeline with fine-tuning and prompt learning. BERT: bidirectional encoder representations from transformers; ICD: International Classification of Diseases; MDR: medical domain refinement.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The pipeline achieved high scores for most <italic>ICD</italic> codes, although the scores varied depending on the data distribution and sample size for each code. We observed a weak positive correlation between sample size and model performance, suggesting that larger samples enhanced the model’s learning capability. Conversely, smaller samples tended to have lower micro–<italic>F</italic><sub>1</sub>-scores, with a trade-off between precision and recall for certain classes. Although our prediction accuracy for ACSs is relatively low, further analysis revealed that in actual clinical settings, ACS was frequently misdiagnosed as cardiac edema (hypertensive emergency) and pulmonary embolism (acute pulmonary embolism). These diseases exhibit similar clinical manifestations and, therefore, require meticulous differential diagnosis to rule out other possibilities. We believe that the overlap of symptoms is a major cause of the difficulty in classifying the model and that inconsistencies in medical histories recorded by physicians further complicate the model’s ability to differentiate similar pathologies. Despite these variations, our pipeline demonstrated satisfactory performance across the different <italic>ICD</italic> codes.</p>
      </sec>
      <sec>
        <title>Few-Shot Learning</title>
        <p>We conducted few-shot experiments to evaluate the performance of the fine-tuned MDR-BERT with the prompt learning pipeline using different sample sizes from the training set. We randomly selected samples ranging from 1 to 4000 and evaluated the models on the test set. <xref rid="figure8" ref-type="fig">Figure 8</xref> shows the accuracy, micro–<italic>F</italic><sub>1</sub>-score, and macro-AUC scores for each sample size.</p>
        <p>The objective of small-sample learning is to develop models that can learn effectively and make accurate predictions with only a small number of samples, such as 500 or fewer. As shown in <xref rid="figure8" ref-type="fig">Figure 8</xref>, when the sample size reaches 500, the model’s accuracy, AUC score, and other indicators not only achieve relatively high scores but also reach an inflection point and plateau. At this point, the model produces a relatively satisfactory outcome. This indicates that for the task of <italic>ICD</italic> coding using medical records, 500 samples may be sufficient for the model to learn the key features needed to distinguish between different diagnoses. It suggests that the model has captured enough information to make effective predictions. Additionally, the workload involved in annotating 500 medical texts is manageable and feasible. This number strikes a balance between the effort required for data preparation and the performance gains achieved by the model. Given the complexity and specialized nature of medical records, annotating 500 examples provides a comprehensive representation of the dataset while staying within practical limits. This makes it a reasonable and efficient choice for training the model to achieve satisfactory performance in <italic>ICD</italic> coding tasks.</p>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Few-shot experiments on MDR-BERT with fine-tuning and prompt learning. AUC: area under the receiver operating characteristic curve; BERT: bidirectional encoder representations from transformers; MDR: medical domain refinement.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63020_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>An automated <italic>ICD</italic> coding system for long free-text data is a fundamental platform for clinical research and practice, including clinical trials and pharmacoeconomic management. In this study, we developed a framework based on Key-BERT, a continuously trained and tunable PLM, combined with task-specific prompt learning. We collected a total of 584,969 clinical notes from admission records and discharge summaries in the cardiovascular departments of 8 medical centers.</p>
        <p>We used most of the data to continue pretraining a medical corpus and used an independent set of 9337 discharge records with 13 <italic>ICD</italic> codes for CVDs in the <italic>ICD</italic> classification subtask. Although the MDR-BERT model has some limitations, such as restricted generalization capacity and constraints on the length of context it can effectively process, it is important to note that medical texts often have a consistent structure and are generally less dependent on extensive contextual information. Given these characteristics of medical literature, our model is designed to avoid the errors commonly associated with the inherent limitations of BERT’s methodology. The structured nature of medical documents enables the MDR-BERT model to function effectively within its designed parameters, mitigating potential issues that could arise from the broader weaknesses of the BERT framework when applied to more contextually complex or varied text types. To remove irrelevant information and limit the input token size, we filtered and truncated all the data for the <italic>ICD</italic> task into keyword-based segments using Key-BERT. The data were then stratified and split into training, validation, and test sets, with the test set used independently for final evaluation.</p>
        <p>This study primarily focused on transformer-based algorithms, which have been widely applied and shown superior performance in large-scale medical long free-text tasks [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. These algorithms can leverage PLMs that capture the semantic and syntactic information of natural language from extensive corpora, leading to significant performance improvements through multicenter datasets.</p>
        <p>We compared 6 pipelines for the classification downstream task: BERT with fine-tuning, XLNet with fine-tuning, RoBERTa with fine-tuning, frozen BERT with prompt learning, frozen MDR-BERT with prompt learning, and tunable MDR-BERT with prompt learning. The prompt learning setup included 3 types of templates and 2 types of verbalizers. Among these pipelines, MDR-BERT with fine-tuning and prompt learning achieved the best performance on the test set, attaining a micro–<italic>F</italic><sub>1</sub>-score of 0.838, a macro-AUC of 0.958, and an accuracy of 0.838.</p>
        <p>Compared with the pretraining models of RoBERTa and XLNet, our model achieved superior performance in terms of final accuracy and micro–<italic>F</italic><sub>1</sub>-score. This improvement was primarily due to the targeted optimization of the methods and the medical data we selected, which substantially enhanced the model’s performance. Although RoBERTa and XLNet have larger pretraining corpora compared with BERT, our approach benefited more from using a continuation training corpus built from real electronic health records. This specialized data, tailored to our specific requirements, provided a greater enhancement to the model than more general pretraining data. This is why MDR-BERT performs comparably to, or better than, these alternatives in our settings. The favorable outcome of this pipeline can be attributed to the use of a large-scale corpus-based PLM and the task-specific enhancements from the combination of fine-tuning and prompt learning [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. Fine-tuning acts as a model adapter, aligning the model distribution with the task distribution and addressing domain shift and task mismatch issues inherent in PLMs. Prompt learning, with its compact prefix representation and sparse attention mechanism, augments the training data with diverse and natural examples. This augmentation helps mitigate data scarcity and label noise issues in small-sized datasets for downstream tasks.</p>
        <p>The combination of fine-tuning and prompt learning acts as a regularization term that balances model complexity with data quality, ultimately enhancing overall performance. This integrated approach highlights the potential of leveraging advanced transformer-based models and customized learning strategies to improve automated medical coding and other clinical tasks.</p>
        <p>Among the different prompt learning setups, the mixed template and soft verbalizer achieved the best performance. The soft template method outperformed the manual templates method, which can be attributed to the greater semantic and syntactic information, broader search space, and reduced trial-and-error process associated with the soft template method, making it more effective and less time-consuming [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>The mixed template method is a hybrid approach that combines the advantages of both soft and manual templates. It uses a manual template as a base prompt to provide human-readable instructions and natural language labels, while a soft template serves as an auxiliary prompt to provide tunable embeddings that can adapt to specific downstream tasks. This way, the manual template leverages existing knowledge, while the soft template enhances expressiveness and flexibility.</p>
        <p>For the verbalizer, the self-adaptive type had a significantly greater impact on overall performance compared with traditional manual vectors. The soft verbalizer adjusts to the optimal label space for each task and the scale of the pretrained model, rather than being limited by a fixed set of tokens [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. This enhances the accuracy and robustness of the predictions, as well as the diversity and naturalness of the labels. Additionally, by tuning the verbalizer alongside other continuous prompts, it retains the benefits of prompt tuning over fine-tuning, eliminating the need to maintain a separate copy of model parameters for each task during inference.</p>
        <p>To explore the influence of sample size on the performance of our pipeline, we conducted few-shot experiments with a range from 1 to 4000 shots. The results showed unsatisfactory evaluation metrics for small-scale shots, but performance improved rapidly and stabilized at around 500 shots. This suggests that for mid-sized language models, such as BERT, the semantic understanding and representation capabilities may not be strong enough. Therefore, tuning the parameters of the PLM with an appropriate sample size is necessary to achieve better performance on specific tasks.</p>
        <p>Our research confirms that <italic>ICD</italic> classification tasks can be effectively accomplished by continuously optimizing the BERT model. Although this study used cardiology data for training, our model development strategy is not limited to this specific dataset; substituting the training data with data from other departments would also yield the expected outcomes. Therefore, our model demonstrates remarkable generalization capability. We firmly believe that the model we have developed, combined with the expertise of professional physicians, can effectively address the challenges of <italic>ICD</italic> classification for various diseases.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Despite the reasonable performance of our pipeline, this study has certain limitations. First, we trained both the corpus part and the classification task of the framework solely in the cardiovascular department. As a result, the conclusions of this paper may not be generalizable to other medical fields. Second, the <italic>ICD</italic> classification subtask only involved 13 CVD codes, which is not comprehensive enough for clinical practice. Future research could expand to explore the automatic encoding of additional critical heart diseases or even extend to the entire clinical field. This could potentially enhance the applicability and effectiveness of the proposed approach for a broader range of clinical tasks. Third, our model aims to establish an automated analysis system using medical text. However, medical data are inherently multimodal, and modality augmentation can lead to improvements in accuracy. In this context, models such as label alignment for multimodal prompt learning [<xref ref-type="bibr" rid="ref37">37</xref>] and multimodal equivalent transformer [<xref ref-type="bibr" rid="ref38">38</xref>] are designed to handle multimodal data, demonstrating the greater potential for future advancements.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We proposed a real-time framework for <italic>ICD</italic> coding from long medical field–related text to <italic>ICD</italic> labels, eliminating the need for semistructured preprocessing. This framework incorporates Key-BERT, a continuously trained and tunable PLM, and task-specific prompt learning with mixed templates and soft verbalizers. We evaluated our model on a multicenter cardiovascular dataset and applied it to predict 13 <italic>ICD</italic> codes for CVDs, achieving high performance. Our model also demonstrated transferability and generalization across different centers.</p>
        <p>Furthermore, we conducted few-shot experiments to investigate the impact of data size on model performance. The results showed that while the framework was effective on smaller datasets, a certain sample size was necessary to achieve a relatively stable performance level. This study serves as a benchmark for exploring the feasibility and performance of prompt learning in the subtask of large language models or PLMs. Using a multicenter dataset, the approach demonstrated robust performance across hospitals, highlighting its potential for broad deployment.</p>
        <p>Few-shot learning experiments demonstrated feasibility with small-scale datasets, enabling applications for local training on single centers or various single-disease databases. The real-time model identifies <italic>ICD</italic> codes directly, accelerating automated coding compared with semiautomatic approaches that require segment preprocessing. This is particularly impactful for clinical decision support systems that rely on real-time <italic>ICD</italic> coding data.</p>
        <p>Overall, the prompt learning paradigm achieved cutting-edge <italic>ICD</italic> assignment accuracy while offering deployability, few-shot learning capacity, and low latency—advantages that are highly beneficial for health care applications. This automated <italic>ICD</italic> coding pipeline could be further implemented in various clinical applications, such as clinical decision support systems, cohort studies, and disease early warning and diagnosis systems.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ACS</term>
          <def>
            <p>acute coronary syndrome</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BGE-M3</term>
          <def>
            <p>BAAI general embedding multilinguality, multigranularity, and multifunctionality</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CVD</term>
          <def>
            <p>cardiovascular disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ICD</term>
          <def>
            <p>International Classification of Diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MDR</term>
          <def>
            <p>medical domain refinement</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MLM</term>
          <def>
            <p>masked language modeling</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">PLM</term>
          <def>
            <p>pretrained language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">RoBERTa</term>
          <def>
            <p>robustly optimized BERT pretraining approach</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">Transformer-XL</term>
          <def>
            <p>Transformer-Extra-Long</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">XLNet</term>
          <def>
            <p>extreme language network</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the National Key R&#38;D Program of China (grant 2021ZD0140408) and the Independent Research Project of Medical Engineering Laboratory of Chinese PLA General Hospital (grant 2022SYSZZKY23).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steindel</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>International Classification of Diseases, 10th Edition, clinical modification and procedure coding system: descriptive overview of the next generation HIPAA code sets</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>274</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20442144"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001230</pub-id>
          <pub-id pub-id-type="medline">20442144</pub-id>
          <pub-id pub-id-type="pii">17/3/274</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995704</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Malley</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>KF</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Wildes</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Hurdle</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Ashton</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>Measuring diagnoses: ICD code accuracy</article-title>
          <source>Health Serv Res</source>
          <year>2005</year>
          <month>10</month>
          <volume>40</volume>
          <issue>5 Pt 2</issue>
          <fpage>1620</fpage>
          <lpage>39</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/16178999"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/j.1475-6773.2005.00444.x</pub-id>
          <pub-id pub-id-type="medline">16178999</pub-id>
          <pub-id pub-id-type="pii">HESR444</pub-id>
          <pub-id pub-id-type="pmcid">PMC1361216</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kusnoor</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Blasingame</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>DesAutels</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Giuse</surname>
              <given-names>NB</given-names>
            </name>
          </person-group>
          <article-title>A narrative review of the impact of the transition to ICD-10 and ICD-10-CM/PCS</article-title>
          <source>JAMIA Open</source>
          <year>2020</year>
          <month>04</month>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>126</fpage>
          <lpage>131</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32607494"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooz066</pub-id>
          <pub-id pub-id-type="medline">32607494</pub-id>
          <pub-id pub-id-type="pii">ooz066</pub-id>
          <pub-id pub-id-type="pmcid">PMC7309233</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hung</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Automatic International Classification of Diseases coding system: deep contextualized language model with rule-based approaches</article-title>
          <source>JMIR Med Inform</source>
          <year>2022</year>
          <month>06</month>
          <day>29</day>
          <volume>10</volume>
          <issue>6</issue>
          <fpage>e37557</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2022/6/e37557/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/37557</pub-id>
          <pub-id pub-id-type="medline">35767353</pub-id>
          <pub-id pub-id-type="pii">v10i6e37557</pub-id>
          <pub-id pub-id-type="pmcid">PMC9282222</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Upadhyaya</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Murphree</surname>
              <given-names>DH</given-names>
            </name>
            <name name-style="western">
              <surname>Ngufor</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Knight</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Cronk</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cima</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Curry</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Kor</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>Automated diabetes case identification using electronic health record data at a tertiary care facility</article-title>
          <source>Mayo Clin Proc Innov Qual Outcomes</source>
          <year>2017</year>
          <month>07</month>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>100</fpage>
          <lpage>110</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2542-4548(17)30008-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.mayocpiqo.2017.04.005</pub-id>
          <pub-id pub-id-type="medline">30225406</pub-id>
          <pub-id pub-id-type="pii">S2542-4548(17)30008-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6135013</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Diao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lian</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Automated ICD coding for primary diagnosis via clinically interpretable machine learning</article-title>
          <source>Int J Med Inform</source>
          <year>2021</year>
          <month>09</month>
          <volume>153</volume>
          <fpage>104543</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1386-5056(21)00169-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2021.104543</pub-id>
          <pub-id pub-id-type="medline">34391016</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(21)00169-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maheshwari</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>Aman</given-names>
            </name>
            <name name-style="western">
              <surname>Shukla</surname>
              <given-names>Anupam</given-names>
            </name>
            <name name-style="western">
              <surname>Tiwari</surname>
              <given-names>Ritu</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive evaluation for the prediction of mortality in intensive care units with LSTM networks: patients with cardiovascular disease</article-title>
          <source>Biomed Tech (Berl)</source>
          <year>2020</year>
          <month>08</month>
          <day>27</day>
          <volume>65</volume>
          <issue>4</issue>
          <fpage>435</fpage>
          <lpage>446</lpage>
          <pub-id pub-id-type="doi">10.1515/bmt-2018-0206</pub-id>
          <pub-id pub-id-type="medline">31846424</pub-id>
          <pub-id pub-id-type="pii">bmt-2018-0206</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Medical code prediction via capsule networks and ICD knowledge</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2021</year>
          <month>07</month>
          <day>30</day>
          <volume>21</volume>
          <issue>Suppl 2</issue>
          <fpage>55</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-021-01426-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-021-01426-9</pub-id>
          <pub-id pub-id-type="medline">34330264</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-021-01426-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC8323200</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kreuzthaler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeifer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kramer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Secondary use of clinical problem list entries for neural network-based disease code assignment</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2023</year>
          <month>5</month>
          <day>18</day>
          <volume>302</volume>
          <fpage>788</fpage>
          <lpage>792</lpage>
          <pub-id pub-id-type="doi">10.3233/shti230267</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1802.05365"/>
          </comment>
          <comment>Preprint posted online on March 22, 2018</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1802.05365</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Unified language model pre-training for natural language understanding and generation</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1905.03197"/>
          </comment>
          <comment>Preprint posted online on October 15, 2019</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1905.03197</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ghazvininejad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1910.13461"/>
          </comment>
          <comment>Preprint posted online on October 29, 2019</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1910.13461</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04805"/>
          </comment>
          <comment>Preprint posted online on May 24, 2019</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Coutinho</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Martins</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Transformer-based models for ICD-10 coding of death certificates with Portuguese text</article-title>
          <source>J Biomed Inform</source>
          <year>2022</year>
          <month>12</month>
          <volume>136</volume>
          <fpage>104232</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(22)00237-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2022.104232</pub-id>
          <pub-id pub-id-type="medline">36307020</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(22)00237-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McAuley</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>EY</given-names>
            </name>
            <name name-style="western">
              <surname>Gentili</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>CN</given-names>
            </name>
          </person-group>
          <article-title>RadBERT: adapting transformer-based language models to radiology</article-title>
          <source>Radiol Artif Intell</source>
          <year>2022</year>
          <month>07</month>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>e210258</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35923376"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/ryai.210258</pub-id>
          <pub-id pub-id-type="medline">35923376</pub-id>
          <pub-id pub-id-type="pmcid">PMC9344353</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1907.11692"/>
          </comment>
          <comment>Preprint posted online on July 26, 2019</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elbattah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arnaud</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gignon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dequen</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The role of text analytics in healthcare: a review of recent developments and applications</article-title>
          <year>2021</year>
          <conf-name>14th International Joint Conference on Biomedical Engineering Systems and Technologies - Scale-IT-up</conf-name>
          <conf-date>February 11–13, 2021</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <pub-id pub-id-type="doi">10.5220/0010414508250832</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sezgin</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rust</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Extracting medical information from free-text and unstructured patient-generated health data using natural language processing methods: feasibility study with real-world data</article-title>
          <source>JMIR Form Res</source>
          <year>2023</year>
          <month>03</month>
          <day>07</day>
          <volume>7</volume>
          <fpage>e43014</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://formative.jmir.org/2023//e43014/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/43014</pub-id>
          <pub-id pub-id-type="medline">36881467</pub-id>
          <pub-id pub-id-type="pii">v7i1e43014</pub-id>
          <pub-id pub-id-type="pmcid">PMC10031450</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Carbonell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>XLNet: generalized autoregressive pretraining for language understanding</article-title>
          <year>2019</year>
          <month>9</month>
          <conf-name>NIPS'19: 33rd International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 8-14, 2019</conf-date>
          <conf-loc>Vancouver, BC, Canada</conf-loc>
          <fpage>5753</fpage>
          <lpage>5763</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>GPT understands, too</article-title>
          <source>AI Open</source>
          <year>2024</year>
          <volume>5</volume>
          <fpage>208</fpage>
          <lpage>215</lpage>
          <pub-id pub-id-type="doi">10.1016/j.aiopen.2023.08.012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schick</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Exploiting cloze questions for few shot text classification and natural language inference</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2001.07676"/>
          </comment>
          <comment>Preprint posted online on January 25, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2001.07676</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Joyce</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kormilitzin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nevado-Holgado</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Clinical prompt learning with frozen language models</article-title>
          <source>IEEE Trans Neural Netw Learn Syst</source>
          <year>2024</year>
          <month>11</month>
          <volume>35</volume>
          <issue>11</issue>
          <fpage>16453</fpage>
          <lpage>16463</lpage>
          <pub-id pub-id-type="doi">10.1109/tnnls.2023.3294633</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Prefix-tuning: optimizing continuous prompts for generation</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2101.00190"/>
          </comment>
          <comment>Preprint posted online on January 21, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2101.00190</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>P-Tuning v2: prompt tuning can be comparable to fine-tuning universally across scales and tasks</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2110.07602"/>
          </comment>
          <comment>Preprint posted online on March 20, 2022</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2110.07602</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>OpenPrompt: an open-source framework for prompt-learning</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2111.01998"/>
          </comment>
          <comment>Preprint posted online on November 3, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2111.01998</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>FF</given-names>
            </name>
            <name name-style="western">
              <surname>Araki</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Neubig</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>How can we know what language models know?</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1911.12543"/>
          </comment>
          <comment>Preprint posted online on May 3, 2020</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1911.12543</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haviv</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Berant</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Globerson</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>BERTese: learning to speak to BERT</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2103.05327"/>
          </comment>
          <comment>Preprint posted online on March 11, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2103.05327</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kandpal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Universal adversarial triggers for attacking and analyzing NLP</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1908.07125"/>
          </comment>
          <comment>Preprint posted online on January 3, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1908.07125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Template-based named entity recognition using BART</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2106.01760"/>
          </comment>
          <comment>Preprint posted online on June 3, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2106.01760</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Fisch</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Making pre-trained language models better few-shot learners</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2012.15723"/>
          </comment>
          <comment>Preprint posted online on June 2, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2012.15723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Thomson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Roy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Platanios</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Pauls</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Eisner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Durme</surname>
              <given-names>BV</given-names>
            </name>
          </person-group>
          <article-title>Constrained language models yield few-shot semantic parsers</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.08768"/>
          </comment>
          <comment>Preprint posted online on November 16, 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2104.08768</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schick</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schmid</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Automatically identifying words that can serve as labels for few-shot text classification</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2010.13641"/>
          </comment>
          <comment>Preprint posted online on October 26, 2020</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2010.13641</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.03762"/>
          </comment>
          <comment>Preprint posted online on August 2, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Self-supervised contextual keyword and keyphrase retrieval with self-labelling</article-title>
          <source>Preprints</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.preprints.org/manuscript/201908.0073/v1"/>
          </comment>
          <comment>Preprint posted online on August 6, 2019</comment>
          <pub-id pub-id-type="doi">10.20944/preprints201908.0073.v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ICD_promptLearning</article-title>
          <source>GitHub</source>
          <year>2024</year>
          <access-date>2024-12-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/PLA301dbgroup2/ICD_promptLearning">https://github.com/PLA301dbgroup2/ICD_promptLearning</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tinn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Fine-tuning large neural language models for biomedical natural language processing</article-title>
          <source>Patterns (N Y)</source>
          <year>2023</year>
          <month>04</month>
          <day>14</day>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>100729</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3899(23)00069-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.patter.2023.100729</pub-id>
          <pub-id pub-id-type="medline">37123444</pub-id>
          <pub-id pub-id-type="pii">S2666-3899(23)00069-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10140607</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ruan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>LAMM: label alignment for multi-modal prompt learning</article-title>
          <year>2024</year>
          <month>03</month>
          <day>24</day>
          <conf-name>The Thirty-Eighth AAAI Conference on Artificial Intelligence (AAAI-24)</conf-name>
          <conf-date>February 20-27, 2024</conf-date>
          <conf-loc>Vancouver, BC, Canada</conf-loc>
          <fpage>1815</fpage>
          <lpage>1823</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojs.aaai.org/index.php/AAAI/article/view/27950/27920"/>
          </comment>
          <pub-id pub-id-type="doi">10.1609/aaai.v38i3.27950</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ruan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Learning robust visual-semantic embedding for generalizable person re-identification</article-title>
          <source>arXiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2304.09498"/>
          </comment>
          <comment>Preprint posted online on April 19, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2304.09498</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
