<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v13i1e68138</article-id>
      <article-id pub-id-type="pmid">40465350</article-id>
      <article-id pub-id-type="doi">10.2196/68138</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Trajectory-Ordered Objectives for Self-Supervised Representation Learning of Temporal Healthcare Data Using Transformers: Model Development and Evaluation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Turbe</surname>
            <given-names>Hugues</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hung</surname>
            <given-names>Chih-Chieh</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Amirahmadi</surname>
            <given-names>Ali</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Center for Applied Intelligent Systems Research in Health</institution>
            <institution>Halmstad University</institution>
            <addr-line>Kristian IV:s väg 3</addr-line>
            <addr-line>Halmstad, 30118</addr-line>
            <country>Sweden</country>
            <phone>46 03516 7100</phone>
            <email>ali.amirahmadi@hh.se</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1999-8435</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Etminani</surname>
            <given-names>Farzaneh</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2006-6229</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Björk</surname>
            <given-names>Jonas</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1883-2000</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Melander</surname>
            <given-names>Olle</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2581-484X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Ohlsson</surname>
            <given-names>Mattias</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1145-4297</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Center for Applied Intelligent Systems Research in Health</institution>
        <institution>Halmstad University</institution>
        <addr-line>Halmstad</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Applied Intelligent Systems Research</institution>
        <institution>Halmstad University</institution>
        <addr-line>Halmstad</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of research and development (FoU)</institution>
        <institution>Region Halland</institution>
        <addr-line>HALMSTAD</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Division of Occupational and Environmental Medicine</institution>
        <institution>Lund University</institution>
        <addr-line>Lund</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Clinical Sciences</institution>
        <institution>Lund University</institution>
        <addr-line>Lund</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Centre for Environmental and Climate Science</institution>
        <institution>Lund University</institution>
        <addr-line>Lund</addr-line>
        <country>Sweden</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Ali Amirahmadi <email>ali.amirahmadi@hh.se</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>6</month>
        <year>2025</year>
      </pub-date>
      <volume>13</volume>
      <elocation-id>e68138</elocation-id>
      <history>
        <date date-type="received">
          <day>29</day>
          <month>10</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>13</day>
          <month>12</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>26</day>
          <month>1</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>28</day>
          <month>3</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Ali Amirahmadi, Farzaneh Etminani, Jonas Björk, Olle Melander, Mattias Ohlsson. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 04.06.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2025/1/e68138" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The growing availability of electronic health records (EHRs) presents an opportunity to enhance patient care by uncovering hidden health risks and improving informed decisions through advanced deep learning methods. However, modeling EHR sequential data, that is, patient trajectories, is challenging due to the evolving relationships between diagnoses and treatments over time. Significant progress has been achieved using transformers and self-supervised learning. While BERT-inspired models using masked language modeling (MLM) capture EHR context, they often struggle with the complex temporal dynamics of disease progression and interventions.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to improve the modeling of EHR sequences by addressing the limitations of traditional transformer-based approaches in capturing complex temporal dependencies.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We introduce Trajectory Order Objective BERT (Bidirectional Encoder Representations from Transformers; TOO-BERT), a transformer-based model that advances the MLM pretraining approach by integrating a novel TOO to better learn the complex sequential dependencies between medical events. TOO-BERT enhances the context learned by MLM by pretraining the model to distinguish ordered sequences of medical codes from permuted ones in a patient trajectory. The TOO is enhanced by a conditional selection process that focuses on medical codes or visits that frequently occur together, to further improve contextual understanding and strengthen temporal awareness. We evaluate TOO-BERT on 2 extensive EHR datasets, MIMIC-IV hospitalization records and the Malmo Diet and Cancer Cohort (MDC)—comprising approximately 10 and 8 million medical codes, respectively. TOO-BERT is compared against conventional machine learning methods, a transformer trained from scratch, and a transformer pretrained on MLM in predicting heart failure (HF), Alzheimer disease (AD), and prolonged length of stay (PLS).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>TOO-BERT outperformed conventional machine learning methods and transformer-based approaches in HF, AD, and PLS prediction across both datasets. In the MDC dataset, TOO-BERT improved HF and AD prediction, increasing area under the receiver operating characteristic curve (AUC) scores from 67.7 and 69.5 with the MLM-pretrained Transformer to 73.9 and 71.9, respectively. In the MIMIC-IV dataset, TOO-BERT enhanced HF and PLS prediction, raising AUC scores from 86.2 and 60.2 with the MLM-pretrained Transformer to 89.8 and 60.4, respectively. Notably, TOO-BERT demonstrated strong performance in HF prediction even with limited fine-tuning data, achieving AUC scores of 0.877 and 0.823, compared to 0.839 and 0.799 for the MLM-pretrained Transformer, when fine-tuned on only 50% (442/884) and 20% (176/884) of the training data, respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>These findings demonstrate the effectiveness of integrating temporal ordering objectives into MLM-pretrained models, enabling deeper insights into the complex temporal relationships inherent in EHR data. Attention analysis further highlights TOO-BERT’s capability to capture and represent sophisticated structural patterns within patient trajectories, offering a more nuanced understanding of disease progression.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>patient trajectories</kwd>
        <kwd>disease prediction</kwd>
        <kwd>representation learning</kwd>
        <kwd>masked language model</kwd>
        <kwd>deep learning</kwd>
        <kwd>BERT</kwd>
        <kwd>electronic health record</kwd>
        <kwd>language model</kwd>
        <kwd>transformer</kwd>
        <kwd>heart failure</kwd>
        <kwd>alzheimer disease</kwd>
        <kwd>prolonged length of stay</kwd>
        <kwd>effectiveness</kwd>
        <kwd>temporal</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In modern health care, electronic health records (EHRs) are crucial as comprehensive repositories encompassing a wide range of patient data, including diagnoses, medications, treatments, laboratory data, and demographic information. The accumulation of EHR data longitudinally builds EHR trajectories, sometimes called patient trajectories. This information serves as an important resource for assessing a patient’s current health status and predicting potential health risks. Using advanced deep learning (DL) models with this extensive data opens the possibility of making predictions, such as disease risks, treatment outcomes, and patient prognoses. This possibility equips health care providers with the tools to make informed decisions, ultimately improving patient care, optimizing interventions, and reducing health care costs.</p>
      <p>However, developing DL methods for modeling EHR data is full of challenges. Effectively addressing the complexity of the heterogeneous data extracted from patients’ EHR [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>], capturing short- and long-term relationships between medical codes across various visits, contending with the scarcity of publicly available EHR sources, and navigating the vast diversity of diseases pose significant hurdles. In addition, ensuring transparency and explainability in the predictions made by DL techniques demands substantial effort [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>Current state-of-the-art models for EHR trajectory data are based on transformer architecture [<xref ref-type="bibr" rid="ref8">8</xref>], in particular, models inspired by the Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref9">9</xref>] architecture [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. Such models aim to capture short- and long-term relationships through a task-agnostic representation learning (RL) approach, where the masked language model (MLM) pretraining objective is very common.</p>
      <p>The BERT-inspired models for EHR trajectory data can be examined from various perspectives: data and model size, used data modalities, architecture, and pretraining objectives. Within this context, we focus on the primary and auxiliary pretraining objective functions designed to enhance the capabilities of the learned representation.</p>
      <p>The primary objective typically takes the form of a generative task, benefiting from its enhanced ability to grasp intricate relationships. MLM objectives have found widespread application in EHR trajectory prediction tasks, largely owing to the capabilities of BERT models to learn the context (both past and future simultaneously) [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. The autoregressive pretraining objective [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] serves as the other approach for RL of EHR trajectory data. It accomplishes this by predicting upcoming medical events, such as the codes for the next day or the subsequent visit, leveraging the patient’s historical data [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>].</p>
      <p>A range of auxiliary pretraining objectives has been proposed to enhance the RL performance, incorporating either external knowledge or using contrastive learning. Examples of the former include Shang et al [<xref ref-type="bibr" rid="ref15">15</xref>] and Amirahmadi et al [<xref ref-type="bibr" rid="ref28">28</xref>] that predicted medications based on diagnoses and diagnoses based on medication to induce relationships within the diagnoses and interventions in the learned representation. Med-BERT [<xref ref-type="bibr" rid="ref11">11</xref>] introduced a length of stay auxiliary prediction task to enrich contextual information about the severity of patients’ health conditions. CEHRT-BERT [<xref ref-type="bibr" rid="ref16">16</xref>] and Claim-PT [<xref ref-type="bibr" rid="ref27">27</xref>] incorporated visit type predictions (eg, inpatient and outpatient visits) to represent external domain knowledge into the model, mitigating the effect of sparse codes based on the observation that different medical concepts are associated with different visit types.</p>
      <p>RareBERT [<xref ref-type="bibr" rid="ref19">19</xref>] introduced a 1-class classification objective to improve model performance for rare disease prediction. Similarly, AdaDiag [<xref ref-type="bibr" rid="ref18">18</xref>] added a domain classifier to distinguish data from different institutes and enhance the generalizability and robustness of the learned representation against dataset shifts.</p>
      <p>From the contrastive learning category, we find Hierarchical BEHRT (Hi-BEHRT) [<xref ref-type="bibr" rid="ref17">17</xref>] that used bootstrap your own latent (BYOL) [<xref ref-type="bibr" rid="ref29">29</xref>] similarity learning, operating under the assumption that varying augmentations of the same input yield similar representations, thereby enhancing the latent representation of the network. Rapt [<xref ref-type="bibr" rid="ref21">21</xref>] trained the transformer to differentiate between different patient trajectories, relying on the Euclidean distance between their last visits to enrich the RL’s understanding of their health condition. In addition, Rapt used another auxiliary objective, like the next sentence prediction, to discern whether a trajectory belongs to a specific patient or constitutes a fusion of various patient trajectories, facilitating the learning of trends within health trajectories. Generative Adversarial Networks Enhanced Pretraining (GRACE) [<xref ref-type="bibr" rid="ref22">22</xref>], addressing the EHR data insufficiency challenge, incorporated a real or fake contrastive learning objective to distinguish authentic EHR data from generative adversarial network (GAN)–generated EHR data within the MLM framework.</p>
      <p>Instances of medical events can influence the likelihood of other medical events, shaping the trajectory of patients toward more or less severe health conditions. Moreover, numerous medical events exhibit semicausal relationships through chains of probability paths that have not been extensively studied [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. The relative intervals between medical events play a pivotal role in adjusting a patient’s trajectory and are a key factor for the RL model. In pretrained language models, similar concepts are applied to comprehend the global coherence of data. BERT [<xref ref-type="bibr" rid="ref9">9</xref>] used a next-sentence prediction (NSP) to capture the global relations between sentences. However, Liu et al [<xref ref-type="bibr" rid="ref34">34</xref>] showed that NSP does not generate a positive impact, and Lan et al [<xref ref-type="bibr" rid="ref35">35</xref>] speculate that the reason lies in the simplicity of NSP and its overlap with the MLM loss. Consequently, they replaced NSP with sentence order prediction, prioritizing coherence prediction over topic prediction [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. Before the mentioned studies, researchers improved their machine translation, constituency parsing models, and object detection by altering the order of input [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. Vinyals et al [<xref ref-type="bibr" rid="ref41">41</xref>] delved into the problem that, while in theory incorporating the order of sequences should not have a significant impact when using complex encoders due to their nature as universal approximators, in practice, it does matter due to underlying nonconvex optimization and more fitting priors.</p>
      <p>Furthermore, EHR trajectories encompass a history of abnormal health conditions, including diseases, laboratory data, and prescribed interventions like medications and procedures. The occurrence of certain diseases can alter a person’s health trajectory and increase the probability of other illnesses. Similarly, interventions often mitigate the severity of conditions at the cost of raising other health risks. Thus, every medical event, whether it involves diseases or medications, can serve as a cause, complication, or early symptom of the recorded codes [<xref ref-type="bibr" rid="ref42">42</xref>]. This study will demonstrate that order objectives, besides the context, enhance the model performance by learning more structural information. In summary, the contributions are mentioned in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
      <boxed-text id="box1" position="float">
        <title>Overview of the study's main contributions.</title>
        <list list-type="bullet">
          <list-item>
            <p>We have examined the ability of Bidirectional Encoder Representations from Transformers–inspired models to capture the representation of sequential information of medical codes. Our findings indicate that although transformers and language models excel at identifying global dependencies based on contextual information, learning the order of diseases and medications can be challenging, especially for patients with long trajectories.</p>
          </list-item>
          <list-item>
            <p>We introduced a novel “trajectory order objective” self-supervised auxiliary task to the masked language model (MLM). This new objective was applied at both the single code and visit levels, and we demonstrated its efficacy in enhancing the original MLM by evaluating it on heart failure, Alzheimer disease, and prolonged length of stay downstream prediction tasks on 2 distinct datasets.</p>
          </list-item>
          <list-item>
            <p>We introduced the conditional code swapping and conditional visit swapping functions built on the “conditional-based order of medical codes.” This function allows swapping more frequent consecutive repetitions, enabling the model to systematically learn the patterns of transitions at both the single code and visit levels.</p>
          </list-item>
          <list-item>
            <p>We demonstrated how adding the new objective reshapes the attention behavior of the transformer model and encourages the model to attend to relations between 2 sets rather than 2 individual codes, enabling the learning of more complex structural relationships.</p>
          </list-item>
        </list>
      </boxed-text>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data</title>
        <p>In this study, we extracted medical diagnoses and medication histories from 2 distinct (EHR) trajectory datasets, namely the Medical Information Mart for Intensive Care IV (MIMIC-IV) hosp module [<xref ref-type="bibr" rid="ref43">43</xref>] and the Malmo Diet and Cancer Cohort (MDC) [<xref ref-type="bibr" rid="ref44">44</xref>]. These 2 datasets have unique characteristics that suit our research objectives.</p>
        <p>The MIMIC-IV hosp module encompasses a rich, detailed collection of inpatient EHR trajectories, comprising a total of approximately 173,000 patient records recorded during 407,000 visits. The data spans from 2008 to 2019, offering a comprehensive view of patient journeys within the hospital setting. MIMIC-IV hosp module contains approximately 10.6 million medical codes associated with a large volume of patients. The MDC data is a prospective cohort from Sweden. It consists of approximately 30,000 individuals residing in the municipality of Malmo (southern Sweden) between 1991 and 1996. The cohort was recruited from a total population of about 74,000 individuals, encompassing all men born between 1923 and 1945 and all women born between 1923 and 1950. All inpatient and outpatient visits between 1992 and 2020 have been recorded, resulting in a total of 531,000 visits. Although the MDC dataset has fewer overall samples, it excels in providing a more extensive patient history, averaging 257 codes per patient compared to MIMIC-IV’s 61 (for more details, refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p>Diseases and medications in both datasets are classified using the <italic>ICD</italic> (<italic>International Classification of Diseases and Related Health Problems</italic>) and Anatomical Therapeutic Chemical Code (ATC), respectively. These coding systems follow a hierarchical format, providing granular details about diseases or medications based on code length.</p>
        <p>To facilitate our self-supervised pretraining, supervised fine-tuning, and final testing, we randomly partitioned the extracted cohort into 3 subsets: 70%, 20%, and 10%, respectively. For further details on the specifications of the MIMIC-IV and MDC datasets, refer to <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Summary statistics of the Medical Information Mart for Intensive Care IV (MIMIC-IV) dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pretraining dataset</td>
                <td>Fine-tuning dataset</td>
                <td>Test dataset</td>
                <td>Total dataset</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Patients, n</td>
                <td>121,000</td>
                <td>36,000</td>
                <td>16,000</td>
                <td>173,000</td>
              </tr>
              <tr valign="top">
                <td>Visits, n</td>
                <td>285,000</td>
                <td>86,000</td>
                <td>37,000</td>
                <td>408,000</td>
              </tr>
              <tr valign="top">
                <td><italic>ICD-9</italic><sup>a</sup> codes, n</td>
                <td>1,940,000</td>
                <td>579,000</td>
                <td>248,000</td>
                <td>2,767,000</td>
              </tr>
              <tr valign="top">
                <td>ATC<sup>b</sup> codes, n</td>
                <td>5,511,000</td>
                <td>1,655,000</td>
                <td>688,000</td>
                <td>7,854,000</td>
              </tr>
              <tr valign="top">
                <td>All codes, n</td>
                <td>7,451,000</td>
                <td>2,234,000</td>
                <td>937,000</td>
                <td>10,622,000</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ICD-9: International Classification of Diseases, Ninth Revision.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>ATC: Anatomical Therapeutic Chemical Code.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Summary statistics of the Malmo Diet and Cancer Cohort (MDC) dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pretraining dataset</td>
                <td>Fine-tuning dataset</td>
                <td>Test dataset</td>
                <td>Total dataset</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Patients, n</td>
                <td>21,000</td>
                <td>6000</td>
                <td>3000</td>
                <td>30,000</td>
              </tr>
              <tr valign="top">
                <td>Visits, n</td>
                <td>373,000</td>
                <td>107,000</td>
                <td>52,000</td>
                <td>531,000</td>
              </tr>
              <tr valign="top">
                <td><italic>ICD-10</italic><sup>a</sup> codes, n</td>
                <td>1,155,000</td>
                <td>331,000</td>
                <td>161,000</td>
                <td>1,647,000</td>
              </tr>
              <tr valign="top">
                <td>ATC<sup>b</sup> codes, n</td>
                <td>4,185,000</td>
                <td>1,223,000</td>
                <td>580,000</td>
                <td>5,988,000</td>
              </tr>
              <tr valign="top">
                <td>All codes, n</td>
                <td>5,339,000</td>
                <td>1,554,000</td>
                <td>741,000</td>
                <td>7,634,000</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ICD-10: International Statistical Classification of Diseases, Tenth Revision.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>ATC: Anatomical Therapeutic Chemical Code.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The use of the MDC dataset for this study was approved by the Ethics Review Board of Sweden (Dnr 2023-00503-01). Regarding the MIMIC-IV dataset, all protected health information (PHI) is officially deidentified. This means that the deletion of PHI from structured data sources (eg, database fields that provide age, genotypic information, and past and current diagnosis and treatment categories) is performed in compliance with the HIPAA (Health Insurance Portability and Accountability Act) standards in order to facilitate public access to the datasets.</p>
      </sec>
      <sec>
        <title>Data Processing and Problem Formulation</title>
        <p>Each dataset D comprises a set of patients P, D = {P<sup>1</sup>,P<sup>2</sup>,...,P<sup>|D|</sup>}. In our study, we considered a total of |D|= 172,980 patients for MIMIC-IV and |D|= 29,664 patients for the MDC cohort. We represent each patient’s longitudinal medical trajectory through a structured set of visit encounters. Given the continuous recording of medical codes in the MDC cohort, we define a visit entity V for each code and all subsequent codes occurring within a 6-month time window. For the MIMIC-IV, we used the predefined visits. This representation is denoted as P, P<sup>i</sup> = {V<sup>i</sup><sub>1</sub>, V<sup>i</sup><sub>2</sub>, …, V<sup>i</sup><sub>O</sub>}, where O represents the total number of visit encounters for patient I. For each visit, V<sub>j</sub><sup>i</sup> = I<sub>j</sub> ∪M<sub>j</sub> is the union of all diagnoses codes I<sub>j</sub> ⊂ I and prescribed medications M<sub>j</sub> ⊂ M that are recorded for the P<sup>i</sup> at visit V<sub>j</sub><sup>i</sup>. To reduce the sparsity, we excluded less frequently occurring medical codes and retained only the initial 4 digits of <italic>ICD</italic> and ATC codes (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This process resulted in 2195 <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) and 137 ATC-5 unique codes for the MIMIC-IV dataset and 1558 <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) and 111 ATC-5 unique codes for the MDC dataset. In addition, for the MIMIC-IV, we converted medication data from National Drug Code (NDC) format to ATC format to benefit from its hierarchical structure and improve comparability.</p>
        <p>To guide the model in understanding changes in encounter times and the structure of each patient’s trajectory, like BERT, we used special tokens. A CLS token is placed at the beginning of each patient's trajectory, while a SEP token is inserted between visits. Consequently, each patient trajectory is represented as <italic>P<sup>i</sup>={CLS, V<sub>1</sub><sup>i</sup>,SEP, V<sub>2</sub><sup>i</sup>,SEP,…,V<sub>O</sub><sup>i</sup>,SEP},</italic> providing the model with valuable context for analysis and prediction.</p>
      </sec>
      <sec>
        <title>Heart Failure Prediction</title>
        <p>The primary downstream task is heart failure (HF) prediction, where the model predicts the incidence of the first HF <italic>ICD</italic> codes I<sub>N=HF</sub> on the Nth visit, given the patient’s previous history of diagnosis and medication intervention:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e68138_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>For each patient’s trajectory, if there were no occurrences of the target disease, it is considered a negative case; otherwise, we excluded the first visit with target codes and all subsequent visits and considered it a positive case. Furthermore, all ATC codes related to HF treatment are excluded. To mitigate trajectory length bias, trajectories with fewer than 10 and 30 visits are excluded from MIMIC-IV and the MDC dataset, respectively, ensuring an equal average number of visits within positive and negative cases. Following these preprocessing steps, we obtained cohorts with a history of 5 visits for MIMIC-IV and 20 visits for the MDC dataset on average. As a remark, in the MDC cohort, this task is equivalent to predicting HF in the next 6 months, as per the preprocessing design.</p>
      </sec>
      <sec>
        <title>Alzheimer Disease Prediction</title>
        <p>The next downstream task is Alzheimer disease (AD) prediction. Similar to the HF prediction downstream task, the model predicts the occurrence of the first AD <italic>ICD</italic> codes I<sub>N=AD</sub> on the Nth visit, considering the patient’s previous history of diagnoses and medication interventions:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e68138_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>We followed the HF prediction preprocessing steps. The AD downstream prediction task was tested on the MDC dataset.</p>
      </sec>
      <sec>
        <title>Prolonged Length of Stay Prediction</title>
        <p>To explore the adaptability of pretrained models to a distinct task from the one they were initially trained on (code prediction), we used the prediction of prolonged length of stay (PLS) as a binary classification downstream task. In this task, the model is assigned the objective of predicting the PLS in the Nth visit based on a patient’s diagnoses and medications in previous visits in the MIMIC-IV dataset:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e68138_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Consistent with [<xref ref-type="bibr" rid="ref45">45</xref>], patients with a stay longer than 5 days were considered positive cases for PLS. To maintain consistency with the HF prediction task, trajectories with fewer than 3 visits were excluded, and the average number of visits was equalized between positive and negative cases. Following these steps, we obtained a cohort with an average history of 5 visits (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>Recent studies have underscored the effectiveness of using multihead transformer architecture and MLM self-supervised learning in the domain of EHR trajectory modeling. While these methods have exhibited superior performance in various contexts, we focus on investigating their limitations related to sequential order learning and propose enhancements to address this issue.</p>
      <p>A fundamental aspect of EHR trajectory modeling is the critical role of the sequential order of medical events in guiding patients’ trajectories. For instance, the timely administration of appropriate medical interventions can significantly alter a patient’s trajectory, either improving the severity of their condition or, conversely, leading to unintended side effects.</p>
      <sec>
        <title>Models Objective</title>
        <p>Our approach involves pretraining a transformer model on 2 distinct generative and contrastive self-supervised learning objectives: MLM and Trajectory-Order Objective (TOO). The MLM is crafted to learn the context, while TOO is designed to capture relations between local contexts. By simultaneously training the model on both of these objectives, we aim to leverage the entire set of patient trajectories and acquire a more comprehensive data representation.</p>
        <sec>
          <title>MLM</title>
          <p>The MLM generative task is used to learn the contextual dependencies among medical codes. In this paper, we corrupt the input by randomly masking the medical codes in each patient’s trajectory, denoted as <italic>P<sup>i</sup></italic><sub>corrupted</sub>, and train the model to maximize the likelihood of the masked codes, denoted as x<sup>k</sup>:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v13i1e68138_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where p<sub>θ</sub>(x<sup>k</sup> | <italic>P<sup>i</sup></italic><sub>corrupted</sub>) is the conditional probability modeled by a deep neural network with parameter θ. We used a sliding window approach across patients’ trajectories to generate additional samples for the MLM objective (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
          <p>In the trajectory order objective, we train the transformer model to learn the relative positions of local features across 2 hierarchical levels, visits and medical codes. The TOO task helps the model gain insights into both causal and noncausal relationships within medical codes and visits. We achieved this by permuting each patient’s trajectory, using them as negative samples, while the unpermuted sequences served as positive examples for the TOO self-supervised contrastive learning task.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v13i1e68138_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>In this equation, state ∈ {permutated, ordered} denotes whether the trajectory <italic>P</italic><sup>i</sup> is ordered or permutated, and y<sub>state</sub> ∈{0,1} is the corresponding label. We implemented the TOO task at the code level and visit level, by code swapping and visit swapping, respectively (refer to <xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Code versus visits swapping. (A) Code swapping does not alter the visit structures of patient trajectories and only substitutes one medical code with another medical code in a different visit. (B) Visits swapping substitutes one visit, along with all its contents, with another visit, further disrupting the relative-time-wise dependencies between diagnoses and medications.</p>
            </caption>
            <graphic xlink:href="medinform_v13i1e68138_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>Code swapping: We initiated code swapping by swapping codes between different visits of a patient’s trajectory. Given the complexity, especially for long trajectories, we designed and implemented 2 distinct methods. First, we randomly selected and swapped a subset of code pairs c<sub>i</sub> and c<sub>j</sub> with uniform probability, called random code swapping (RCS).</p>
          <p>Second, to further facilitate the learning process for the transformer model and guide it toward more meaningful patterns, we introduced a conditional code swapping function (CCS; c<sub>i</sub>, c<sub>j</sub>). The idea is to prioritize code pairs that show a temporal dependency. The CCS function will provide a numerical estimate for such temporal relations, and in practice, code pairs (c<sub>i</sub>, c<sub>j</sub>) with large CCS (c<sub>i</sub>, c<sub>j</sub>) values are sampled more often than pairs with smaller values. We defined the CCS function as follows:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v13i1e68138_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Here, CCnt(c<sub>i</sub>, c<sub>j</sub>) is the count of all occurrences of code pairs (c<sub>i</sub>, c<sub>j</sub>) with the condition that code c<sub>i</sub> appears after code c<sub>j</sub> and that they are located in different visits. The count is performed over all patient trajectories in the pretraining dataset. The max operator in the numerator forces the CCS function to only consider the simple temporal dependencies that code c<sub>i</sub> follows code c<sub>j</sub>. In other words, the maximum operator transforms the bidirectional transition graph between 2 medical codes into a unidirectional graph based on observations in the pretraining dataset. To account for a possible difference in the number of diagnoses and the number of medications, the CCS function was adjusted with a scaling factor S<sub>i,j</sub> (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, for more details). Finally, ε is a small number added to allow for a nonzero probability of selecting code pairs regardless of the relation between CCnt(c<sub>i</sub>, c<sub>j</sub>) and CCnt(c<sub>j</sub>, c<sub>i</sub>). <xref rid="figure2" ref-type="fig">Figure 2</xref> shows the CCS values, as a heatmap, for a selection of code pairs (c<sub>i</sub>, c<sub>j</sub>) for both datasets used in this study.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>The conditional code swapping matrix heat map for a subset of medical codes in the Medical Information Mart for Intensive Care IV and Malmo Diet and Cancer cohort datasets.</p>
            </caption>
            <graphic xlink:href="medinform_v13i1e68138_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Trajectory order objective-Bidirectional Encoder Representations from Transformers architecture and example patient trajectory input. MLM: masked language modeling; TOO: trajectory-order objective.</p>
            </caption>
            <graphic xlink:href="medinform_v13i1e68138_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Visits Swapping</title>
          <p>Visit swapping aims to teach the model the coherence between different levels of local features in the global context. Instead of swapping positions of individual c<sub>i</sub> and c<sub>j</sub> codes, this method involves swapping the positions of all medical codes in visit x with those in visit y. Similar to code swapping, we implemented 2 methods for swapping visits. In the first method, we randomly sampled 2 visits x and y, with a uniform probability, and swapped them, denoted random visit swapping (RVS).</p>
          <p>Second, we introduced the conditional visit swapping (CVS) function to prioritize among the visits to swap. This prioritization is based on the presence of codes within the visits that exhibited the simple temporal relation from the CCS approach as expressed by the CCS function above. To that end, the CVS function is calculated as the sum of CCS scores for all medical codes within visits x and y:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v13i1e68138_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Similar to the way the CCS function was used for code swapping, pairs of visits with large CVS(v<sub>x</sub>,v<sub>y</sub>) values are sampled more often than visit pairs with lower values.</p>
          <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows the CCS matrix heat map for a subset of medical codes in the MIMIC-IV and MDC datasets. Each row and column represents a medical code, and the heat map indicates the CCS score for all combinations of medical codes. Pairs with higher scores (indicated by lighter colors here) are more likely to be swapped with each other. Due to the long-tail distribution of medical codes and the predominance of less frequent codes, only a subset of the more frequent codes is displayed here. Examples of medical code pairs with the highest CCS scores are printed on the right side. For instance, in the MIMIC-IV dataset, “chronic kidney disease” and “diabetes mellitus” exhibit one of the highest CCS scores, suggesting that kidney disease frequently follows diabetes (but not vice versa). Similarly, in the MDC dataset, “atrial fibrillation” and “essential hypertension” have a high CCS score, indicating that atrial fibrillation often appears after hypertension. Training the model explicitly on such relationships allows it to learn more relevant connections between diseases and medications.</p>
        </sec>
      </sec>
      <sec>
        <title>Model Architecture</title>
        <p>In this study, we used a multihead attention transformer encoder, drawing inspiration from BERT [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. The model architecture, illustrated in <xref rid="figure3" ref-type="fig">Figure 3</xref>, includes an embedding module, a multihead attention transformer encoder, a feed-forward layer, and 2 classifier heads. The embedding module integrates medical codes with their associated temporal information. To capture the temporal dynamics, each medical code is paired with a visit sequence number, represented by 2 types of embeddings: a trainable visit number embedding and a nontrainable sinusoidal embedding, added together. These embeddings are summed with the medical code embeddings to form the input tensor X<sub>0</sub>, which is then passed through a standard transformer encoder to obtain the transformed representation X<sub>1</sub>:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e68138_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>The learned representation is then passed to both the MLM and TOO classifier heads. The MLM classifier is a large softmax applied along the vocabulary dimension, used for predicting masked tokens. The TOO classifier is a binary classifier responsible for predicting the correct temporal order of the medical codes.</p>
      </sec>
      <sec>
        <title>Pretraining TOO-BERT</title>
        <p>In the pretraining phase, we adopted a multitask learning approach to train the transformer network on both the MLM and TOO tasks simultaneously. For each batch, we alternated between these 2 objectives and performed gradient descent-based optimization on the weighted sum of their respective losses. This strategy allowed the model to learn both tasks cohesively, mitigating the risk of catastrophic forgetting that could occur if the objectives were trained sequentially [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>] (ie, training on MLM first and then on TOO).</p>
        <p>The total loss function for pretraining was defined as follows.</p>
        <p>Each patient trajectory consists of a sequence of diagnoses, medications, and their associated visit sequence numbers, which are processed as input to TOO-BERT. The model includes an embedding layer, a multihead transformer encoder, and 2 classifier heads. First, all medical codes and their temporal information are embedded in the embedding layer. The combined embeddings are then passed through the multihead transformer encoder, followed by the MLM and TOO classifiers.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v13i1e68138_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where W<sub>MLM</sub> and W<sub>TOO</sub> represent the weights assigned to each loss.</p>
        <p>For the MLM objective, we randomly masked 15% [<xref ref-type="bibr" rid="ref9">9</xref>] of the medical codes (equation 4). Similar to BERT and Med-BERT, during the masking process, each code had an 80% chance of being replaced by [Mask], 10% by a random code, and 10% remained unchanged.</p>
        <p>In the TOO task, we trained the model to classify the permuted trajectories using RCS, CCS, RVS, and CVS methods. We initially evaluated the model’s capability on the TOO task across various permutations.</p>
      </sec>
      <sec>
        <title>Fine-Tuning for Downstream Task</title>
        <p>Following the pretraining phases for trajectory representation learning, we added a Bidirectional Gated Recurrent Unit (Bi-GRU) classifier head on top of the pretrained network, similar to Med-BERT, and finetuned it using the fine-tuning split for each specific downstream task. To enhance the fine-tuning process, we incorporated a layer-wise learning rate decay strategy [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>] with gradient descent to decrease the weight changes in the initial layers compared with the later layers, thereby retaining the basic knowledge acquired during pretraining. In the final step, we compared the performance of our models against Logistic Regression, Random Forest, MLP, Bi-GRU, and a pretrained transformer with only the MLM objective.</p>
      </sec>
      <sec>
        <title>Implementation Details</title>
        <p>We set aside 10% (12000/121000 in the MIMIC-IV and 2000/21000 in the MDC dataset) of the pretraining dataset for monitoring the transformer’s performance on the MLM and TOO pretraining objectives. The fine-tuning dataset was divided into 5 splits. For each iteration, we fine-tuned the pretrained models and trained the baseline models on 4 splits, using the remaining portion for early stopping. The reported results represent the average and SD of the performance across the 5 trained models on the isolated test dataset.</p>
        <p>For the pretraining phase, we used a neural network featuring 5 self-attention heads and one transformer encoder with a d<sub>k</sub> = d<sub>v</sub> = d<sub>x</sub> = 36 (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), comprising approximately 300,000 learnable parameters. In each dataset, we computed the length of the trajectory for all patients and considered the 0.7 quantiles of the trajectory lengths as the maximum sequence length. For trajectories exceeding this length, we applied a moving window to generate additional augmented pretraining samples. We used the Adam optimizer with a learning rate of 7e-5, a weight decay of 0.015, and a dropout rate of 0.1, and trained the model until the loss curve stabilized. The MLP network comprises 2 hidden layers with 250 and 100 nodes, and the Bi-GRU network features one bidirectional GRU layer with 64 hidden nodes.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Evaluation of Pretraining on the TOO Auxiliary Task</title>
        <p>We assessed the effectiveness of the transformer models in learning the proposed TOO auxiliary objectives through a series of experiments conducted on the MDC and MIMIC-IV datasets. The four proposed swapping methods (RCS, CCS, RVS, and CVS) were applied with different amounts of swapped code or visit pairs, and the models’ performance in detecting whether a trajectory contained swapped codes or visits was evaluated under varying amounts of swapping. <xref rid="figure4" ref-type="fig">Figure 4</xref> illustrates the impact of increasing the percentage of swaps on classification performance for each dataset and swapping method.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The accuracy of the transformer model in classifying various types of swapping during the pretraining phase on the 10% unseen data from the pretraining split is shown for MDC and MIMIC-IV datasets. (A) The pretrained model can classify permuted samples with even a very low percentage of swapping on the MIMIC-IV dataset. On the other hand, classifying the permuted samples on the MDC was quite challenging. (B) The classification accuracy of the visits-swapped samples increases by raising the number of swapped visits for both methods and both datasets. MDC: Malmo Diet and Cancer Cohort; MIMIC-IV: Medical Information Mart for Intensive Care IV.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e68138_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>A, for the MIMIC-IV dataset, the transformer model could easily detect swapped trajectories using RCS and CCS methods for swap percentages above 40% (25/62). In contrast, the MDC dataset presented a different challenge. MDC trajectories are, on average, approximately 8 times longer (in terms of visits per patient) than those in MIMIC-IV. Here, the model struggled with the RCS, failing to find solutions effectively across any percentage of swapped pairs. In the CCS task, the model began to classify swapped trajectories more accurately only when the percentage of swaps exceeded 45% (102/254).</p>
        <p>The results for the visit-swapping tasks (RVS and CVS) are presented in <xref rid="figure4" ref-type="fig">Figure 4</xref>B. These tasks proved more challenging for the models, especially at lower percentages of swaps. For the CVS task in the MDC dataset, the model required over 80% (14/18) of visits to be swapped before achieving satisfactory classification performance. Conversely, in the RVS task, the model achieved 0.8 accuracy with just 40% (7/18) of swapped visits.</p>
        <p>For the MIMIC-IV dataset, the CVS task remained difficult for the transformer models, with only a single visit swap (1/2, 42%). However, in the RVS task, the model successfully classified trajectories with only one visit swap, demonstrating greater ease in learning visit-level temporal disruptions in this dataset.</p>
        <p>To improve the pretraining performance on the MDC cohort with the RCS swapping method, we used a transfer learning approach. Initial weights from the transformer using the CCS method were used for the RCS task with 45% (102/254) code swapping, and this resulted in an accuracy of 0.684. This approach was also applied during the fine-tuning step for MLM+TOO<sub>RCS</sub> on the MDC dataset.</p>
      </sec>
      <sec>
        <title>Evaluation of Downstream Tasks</title>
        <p>We evaluated the prediction performance of HF and PLS for the MIMIC-IV dataset, while for the MDC dataset, we evaluated the HF and AD prediction performance. The datasets’ specifications and the downstream tasks definitions are described in the Data section. The percentages of swapping used for RCS, CCS, RVS, and CVS during pretraining are shown in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and the selection was based on the validation performance within the fine-tuning dataset.</p>
        <p><xref ref-type="table" rid="table3">Table 3</xref> shows the performance of Logistic Regression, Random Forest, MLP, Bi-GRU, the transformer pretrained with the MLM auxiliary task, and the 4 variations of the MLM+TOO auxiliary tasks. Consistent with the results of the Med-BERT model [<xref ref-type="bibr" rid="ref11">11</xref>], the transformer pretrained with only MLM outperformed almost all other conventional methods in all downstream tasks for both datasets.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Average AUC<sup>a</sup> values (%) and SD for different methods for the HF<sup>b</sup> prediction, AD<sup>c</sup> prediction, and PLS<sup>d</sup> prediction downstream tasks on the test datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Model or dataset</td>
                <td>HF prediction (MDC<sup>e</sup>)</td>
                <td>AD prediction (MDC)</td>
                <td>HF prediction (MIMIC-IV<sup>f</sup>)</td>
                <td>PLS prediction (MIMIC-IV)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>62.4 (1.1)</td>
                <td>56.4 (1.1)</td>
                <td>83.8 (1.1)</td>
                <td>54.2 (0.4)</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>60.7 (0.5)</td>
                <td>51.8 (0.3)</td>
                <td>78.6 (1.6)</td>
                <td>51.1 (0.3)</td>
              </tr>
              <tr valign="top">
                <td>MLP</td>
                <td>67.9 (3.0)</td>
                <td>68.0 (1.5)</td>
                <td>86.0 (0.5)</td>
                <td>59.3 (1.9)</td>
              </tr>
              <tr valign="top">
                <td>Bi-GRU</td>
                <td>62.3 (1.2)</td>
                <td>60.4 (1.1)</td>
                <td>85.0 (1.3)</td>
                <td>55.9 (1.0)</td>
              </tr>
              <tr valign="top">
                <td>MLM<sup>g</sup></td>
                <td>67.7 (2.6)</td>
                <td>69.5 (1.6)</td>
                <td>86.2 (0.9)</td>
                <td>60.2 (1.2)</td>
              </tr>
              <tr valign="top">
                <td> MLM+TOO<sup>h</sup><sub>RCS</sub><sup>i</sup></td>
                <td>65.1 (1.2)</td>
                <td>65.6 (0.7)</td>
                <td>88.1 (0.7)</td>
                <td>58.4 (0.9)</td>
              </tr>
              <tr valign="top">
                <td> MLM+TOO<sub>CCS</sub><sup>j</sup></td>
                <td>64.6 (2.7)</td>
                <td>67.2 (1.3)</td>
                <td>89.8 (0.8)</td>
                <td>60.4 (1.2)</td>
              </tr>
              <tr valign="top">
                <td> MLM+TOO<sub>RVS</sub><sup>k</sup></td>
                <td>72.8 (3.1)</td>
                <td>70.4 (0.2)</td>
                <td>87.9 (1.7)</td>
                <td>57.3 (0.8)</td>
              </tr>
              <tr valign="top">
                <td> MLM+TOO<sub>CVS</sub><sup>l</sup></td>
                <td>73.9 (1.9)</td>
                <td>71.9 (1.6)</td>
                <td>87.2 (1.8)</td>
                <td>58.8 (1.6)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>AUC: area under the receiver operating characteristic curve.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>HF: heart failure.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>AD: Alzheimer disease.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>PLS: prolonged length of stay.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>MDC: Malmo Diet and Cancer Cohort.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>MIMIC-IV: Medical Information Mart for Intensive Care IV.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>MLM: masked language modeling.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>TOO: trajectory-order objective.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>RCS: random code swapping.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>CCS: conditional code swapping.</p>
            </fn>
            <fn id="table3fn11">
              <p><sup>k</sup>RVS: random visit swapping.</p>
            </fn>
            <fn id="table3fn12">
              <p><sup>l</sup>CVS: conditional visit swapping.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The visit-level TOO objective methods yielded the best HF and AD prediction performance on the MDC dataset, which features much longer patient trajectories. Using the CVS for the TOO objective achieved the highest area under the receiver operating characteristic curve (AUC) of 0.739 and 0.719 for HF and AD prediction, respectively. Moreover, since the model pretrained with random code swapping exhibited weak performance on the MDC dataset (<xref rid="figure4" ref-type="fig">Figure 4</xref>A), we initialized the model with the weights of the CCS pretrained model. We achieved approximately 0.74 accuracy on the TOO objective and subsequently fine-tuned this model for HF prediction, increasing the AUC to 0.659. However, this result remained lower than that of the transformer pretrained solely on MLM.</p>
        <p>For HF prediction on the MIMIC-IV dataset, the TOO auxiliary task with code swapping improved performance more than other methods. Applying the CCS over the TOO objective achieved the best AUC of 0.898. Although predicting the PLS on the next visit is challenging for all models, adding the CCS objective led to the best AUC.</p>
      </sec>
      <sec>
        <title>Performance Boost on Data Insufficiency</title>
        <p>We further evaluated the impact of combining the proposed TOO with the MLM, using varying fine-tuning sample sizes for predicting HF on the MIMIC-IV test dataset, and compared its performance to the transformer pretrained on MLM and MLP, the most successful conventional method. Fine-tuning sample sizes were reduced to 50% (442/884), 20% (176/884), and 10% (88/884). <xref rid="figure5" ref-type="fig">Figure 5</xref> presents the performance of the MLP (orange line), the transformer pretrained with only MLM (red line), and transformers pretrained with MLM combined with RCS (blue line), CCS (green line), RVS (black line), and CVS (pink line). As the sample size decreased, the transformer pretrained with MLM+TOO<sub>CCS</sub> achieved higher AUC scores, highlighting its effectiveness in handling data insufficiency.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Comparison of HF prediction AUC values for the test sets by fine-tuning on different data sizes on the MIMIC-IV dataset. The shadows represent the 90% CI. AUC: area under the receiver operating characteristic curve; HF: heart failure; MIMIC-IV: Medical Information Mart for Intensive Care IV. MLM: masked language modeling; MLP: multilayer perceptron; TOO: trajectory-order objective.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e68138_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>The Effect of TOO on Attention Weights</title>
        <p>Visualizing the attention scores, A<sub>h</sub>, of the transformer provides valuable insights into the model’s decision-making process and its representation learning capabilities. More capable models can learn and attend to more complex patterns. <xref rid="figure6" ref-type="fig">Figure 6</xref> shows attention scores for the fine-tuned models based on only MLM, MLM+TOO<sub>RVS</sub>, and MLM+TOO<sub>CCS</sub> pretraining. The attention scores come from a single patient trajectory for the HF prediction task on the MIMIC-IV dataset. Attention scores for all TOO-BERT variants can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In the rightmost column, the attention scores of the model pretrained solely with MLM show a primary focus on the latest codes in the trajectory. In contrast, models pretrained with the TOO objective (shown in the first and second columns) exhibit more diverse attention patterns, capturing more complex and structured relationships across the trajectory. In addition, models pretrained with the visit-level TOO objective (RVS) demonstrated an increased focus on the interactions between sets of consecutive codes (ie, segment-level attention). In contrast, the model pretrained with the CCS objective tended to exhibit attention at the individual code level.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>The attention scores (5 heads) for 3 fine-tuned models on HF prediction for the MIMIC-IV dataset, shown for a specific sample from the test set. HF: heart failure; MIMIC-IV: Medical Information Mart for Intensive Care IV.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e68138_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In <xref rid="figure6" ref-type="fig">Figure 6</xref>, lighter colors in the heatmap indicate higher attention weights, while darker colors represent lower attention weights. The variations within each model’s attention heads are meaningful, reflecting how the model allocates attention to specific medical codes within the trajectory. The attention scores of the model pretrained on MLM+CCS demonstrate a greater ability to learn complex patterns. For better interpretability, the attention scores for each head are normalized between 0 and 1, though the original values range from 0 to 0.033, with slight variations across different models.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The sequential order of the medical codes and their interactions within a patient’s EHR trajectory is crucial for understanding and modeling their health status. While BERT-inspired methods have shown good results in this domain, the challenge lies in capturing the intricate relationships between diseases and prescribed interventions. TOO-BERT enhances the performance of BERT-inspired models by simultaneously learning the order of medical codes and the context of the EHR trajectory. Our approach involves pretraining these models explicitly on the sequential information within a patient’s EHR trajectory, aiming to enhance their learning capability by using the temporal structure. In addition, the introduction of 2 novel weighting methods, CCS and CVS, within the TOO objective enables the models to learn more relevant and frequent causal and correlation dependencies with a data-driven approach.</p>
        <p>The pretraining results highlight the differences in learning sequential information between the MIMIC-IV and MDC datasets. The MDC dataset, characterized by an average of approximately 18 visits per patient, presented more challenges in learning single code level sequential information compared to the MIMIC-IV dataset, which has an average of about 2.5 visits per patient (<xref rid="figure4" ref-type="fig">Figure 4</xref>A). This discrepancy could stem from the tendencies of transformers to learn the global dependencies and might require additional strategies to capture local patterns as well [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref53">53</xref>]. In addition, the performance of the model initialized with weights from the CCS task in the MDC dataset on the RCS task demonstrated that the proposed conditional probability approach can effectively help the model converge (section Evaluation of pretraining on the TOO auxiliary task).</p>
        <p>Combining the proposed TOO with the MLM improved estimated AUC values for all downstream tasks on both datasets. Interestingly, in longer trajectories, visit-level swapping seemed more informative than code-level swapping, suggesting that the TOO auxiliary pretraining objective may improve the efficacy of the transformer in modeling long EHR trajectory data, in addition to suggested architectural improvements [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. Moreover, code-level TOO-BERT reduced the performance of MLM in the MDC dataset, possibly due to the increase in MLM loss associated with adding the TOO auxiliary task during pretraining. Transformers trained solely on MLM demonstrated similar performance to MLP, indicating the complexity of EHR trajectories and data insufficiency in pretraining these models. The addition of the TOO task leveraged the MLM, potentially compensating for data insufficiency in complex models.</p>
        <p>Predicting PLS from previous visits based on diagnoses and medications proved to be a particularly challenging task for all models. In addition, while previous research has indicated that BERT-inspired models are excellent few-shot learners [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], the addition of the TOO auxiliary task showed superior performance with reduced fine-tuning sample sizes for HF prediction in the MIMIC-IV dataset.</p>
        <p>The CCS and CVS weighting functions enhance the learning process by prioritizing more frequent transitional patterns. This prioritization helps the model focus on meaningful dependencies, such as transitions where the occurrence of one event strongly predicts the occurrence of another. By emphasizing these strong correlations, the model can converge more efficiently and avoid learning from rare or noisy transitions that may not represent meaningful relationships.</p>
        <p>This approach is particularly beneficial in the context of small datasets. CCS and CVS ensure the model concentrates on the most informative patterns early in training, which not only facilitates convergence but also helps reduce overfitting to spurious relationships. While larger datasets and highly expressive models may eventually learn such relationships without CCS, the function remains valuable for guiding the model toward robustness in more resource-constrained settings. The other way to conceptualize CCS and CVS is the resemblance to first-order and higher-order Markov chains. CCS amplifies the probability weight of swapping a code pair based on the observation that the occurrence of one code increases the probability of observing the other in future visits. Similarly, CVS approximates a higher-order Markov chain by considering a set of conditions [<xref ref-type="bibr" rid="ref57">57</xref>].</p>
        <p>An interesting finding in our study was that training transformers on sequential information enables them to learn more intricate structures. The variability in the size and number of tiles in the attention weights (<xref rid="figure6" ref-type="fig">Figure 6</xref>) suggests that the TOO-objective enabled the transformers to learn a wide range of patterns. However, a quantitative analysis approach would be more suitable for gaining a more concrete understanding of the attention behaviors.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. The TOO task only considers the order of the medical codes and skips the time irregularity of visits in the EHR [<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref60">60</xref>]. Extending the investigation of the new TOO-BERT variant to other datasets with larger sample sizes and longer visit trajectories would enhance our understanding of the differences between code and visit-level swapping pretraining objectives. Furthermore, the inclusion of additional EHR data sources with various modalities, such as test result values and demographic information, with continuous and categorical data types, in TOO-BERT requires further exploration.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Future research directions may involve a more comprehensive investigation of the challenges associated with the lack of locality in transformers and the exploration of more sample-efficient techniques to enhance the performance of TOO-BERT methods in data-limited scenarios. Enhanced positional encoding techniques and transformer architectures could prove beneficial. The impact of history length, influenced by code and visit level swapping, could be examined by pretraining TOO-BERT on larger datasets with longer visit histories. Furthermore, assessing the performance of pretrained TOO-BERT on other types of downstream tasks or tasks subject to dataset shifts would be valuable.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In this study, we explored the potential of incorporating the relative positions of medical codes to improve the learned representation of intricate disease-intervention dependencies, especially in scenarios involving lengthy sequences and limited data. Our introduction of TOO-BERT extends the capabilities of the MLM by focusing on the sequential information within patients’ trajectories at both the single code and visit levels. In addition, to enhance the TOO objective, we introduced condition-based code and visit swapping self-supervised tasks. The outcomes highlight TOO-BERT’s superior performance in predicting PLS, AD, and HF across different sample sizes. Our analysis of attention weights reveals that the TOO task equips transformers to grasp more intricate structural patterns. Future research might involve exploring more sample-efficient pretraining methods and refining transformer architecture and positional encoding to enhance TOO-BERT’s representation learning capabilities further.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional material.</p>
        <media xlink:href="medinform_v13i1e68138_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 7441 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AD</term>
          <def>
            <p>Alzheimer disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ATC</term>
          <def>
            <p>Anatomical Therapeutic Chemical Code</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">Bi-GRU</term>
          <def>
            <p>Bidirectional Gated Recurrent Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">BYOL</term>
          <def>
            <p>Bootstrap Your Own Latent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">CCS</term>
          <def>
            <p>conditional code swapping</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">CVS</term>
          <def>
            <p>Conditional Visit Swapping</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">GAN</term>
          <def>
            <p>generative adversarial network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">GRACE</term>
          <def>
            <p>Generative Adversarial Networks Enhanced Pretraining</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">HF</term>
          <def>
            <p>heart failure</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">Hi-BEHRT</term>
          <def>
            <p>Hierarchical BEHRT</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">ICD</term>
          <def>
            <p>International Classification of Diseases and Related Health Problems</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">ICD-9</term>
          <def>
            <p>International Classification of Diseases, Ninth Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">ICD-10</term>
          <def>
            <p>International Statistical Classification of Diseases, Tenth Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">MDC</term>
          <def>
            <p>Malmo Diet and Cancer Cohort</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">MIMIC-IV</term>
          <def>
            <p>Medical Information Mart for Intensive Care IV</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">MLM</term>
          <def>
            <p>masked language modeling</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">NDC</term>
          <def>
            <p>National Drug Code</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb23">NSP</term>
          <def>
            <p>next-sentence prediction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb24">PHI</term>
          <def>
            <p>protected health information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb25">PLS</term>
          <def>
            <p>prolonged length of stay</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb26">RCS</term>
          <def>
            <p>random code swapping</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb27">RL</term>
          <def>
            <p>representation learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb28">RVS</term>
          <def>
            <p>random visit swapping</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb29">TOO</term>
          <def>
            <p>trajectory-order objective</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb30">TOO-BERT</term>
          <def>
            <p>Trajectory Order Objective Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was conducted as part of the AIR Lund (Artificially Intelligent use of Registers at Lund University) research environment and was funded by the Swedish Research Council (VR, grant 2019-00198). Additional support was provided by CAISR Health, funded by the Knowledge Foundation (KK-stiftelsen) in Sweden (grant 20200208 01 H).</p>
      <p>We shared the Trajectory Order Objective Bidirectional Encoder Representations from Transformers (TOO-BERT) source code on TOO-BERT [<xref ref-type="bibr" rid="ref61">61</xref>]. The pre-trained models are available from the authors upon request and with permission from the Medical Information Mart for Intensive Care IV (MIMIC-IV) and the Malmo Population-Based Cohorts Joint Database.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The MIMIC-IV dataset is publicly available from the PhysioNet repository [<xref ref-type="bibr" rid="ref62">62</xref>]. The Malmo Diet and Cancer Cohort data that support the findings of this study are not publicly available due to data access restrictions imposed by the Malmo Population-Based Cohorts Joint Database. However, the data are available from the corresponding author upon reasonable request and with permission from the Malmo Population-Based Cohorts Joint Database [<xref ref-type="bibr" rid="ref63">63</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>All persons who meet authorship criteria are listed as authors, and all authors certify that they have participated sufficiently in the work to take public responsibility for the content, including participation in the concept, design, analysis, writing, or revision of the manuscript. AA managed conceptualization, methodology, data curation, validation, formal analysis, writing-original draft, writing-review, editing, and visualization.</p>
        <p>FE contributed to the investigation, conceptualization, methodology, validation, supervision, and writing—review and editing. JB was involved in validation, supervision, and writing—review and editing. OM contributed to validation, supervision, and writing—review and editing. MO contributed to the investigation, conceptualization, methodology, validation, supervision, writing—review and editing, as well as project administration.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miotto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kidd</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>JT</given-names>
            </name>
          </person-group>
          <article-title>Deep patient: An unsupervised representation to predict the future of patients from the electronic health records</article-title>
          <source>Sci Rep</source>
          <year>2016</year>
          <volume>6</volume>
          <fpage>26094</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/srep26094"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep26094</pub-id>
          <pub-id pub-id-type="medline">27185194</pub-id>
          <pub-id pub-id-type="pii">srep26094</pub-id>
          <pub-id pub-id-type="pmcid">PMC4869115</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bahadori</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kulas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schuetz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>RETAIN: an interpretable predictive model for healthcare using reverse time attention mechanism</article-title>
          <year>2016</year>
          <conf-name>NIPS'16: Proceedings of the 30th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2016 December 05</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>3512</fpage>
          <lpage>3520</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chitta</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Dipole: Diagnosis prediction in healthcare via attention-based bidirectional recurrent neural networks</article-title>
          <year>2017</year>
          <conf-name>KDD '17: Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>2017 August 13</conf-date>
          <conf-loc>NS, Halifax, Canada</conf-loc>
          <fpage>1903</fpage>
          <lpage>1911</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>HiTANet: Hierarchical time-aware attention networks for risk prediction on electronic health records</article-title>
          <year>2020</year>
          <conf-name>KDD '20: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery &amp; Data Mining</conf-name>
          <conf-date>2020 August 20</conf-date>
          <conf-loc>CA, Virtual Event, USA</conf-loc>
          <fpage>647</fpage>
          <lpage>56</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges in developing deep learning models using electronic health records data: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <volume>25</volume>
          <issue>10</issue>
          <fpage>1419</fpage>
          <lpage>1428</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29893864"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy068</pub-id>
          <pub-id pub-id-type="medline">29893864</pub-id>
          <pub-id pub-id-type="pii">5035024</pub-id>
          <pub-id pub-id-type="pmcid">PMC6188527</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amirahmadi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ohlsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Etminani</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep learning prediction models based on EHR trajectories: a systematic review</article-title>
          <source>J Biomed Inform</source>
          <year>2023</year>
          <volume>144</volume>
          <fpage>104430</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(23)00151-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2023.104430</pub-id>
          <pub-id pub-id-type="medline">37380061</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(23)00151-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Si</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jim Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep representation learning of patient data from Electronic Health Records (EHR): A systematic review</article-title>
          <source>J Biomed Inform</source>
          <year>2021</year>
          <volume>115</volume>
          <fpage>103671</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(20)30299-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103671</pub-id>
          <pub-id pub-id-type="medline">33387683</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30299-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC11290708</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
          <conf-name>NIPS'17: Proceedings of the 31st International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2017 December 04</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>6000</fpage>
          <lpage>6010</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M-W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Bert: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv:1810.04805</source>
          <year>2018</year>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Solares</surname>
              <given-names>JRA</given-names>
            </name>
            <name name-style="western">
              <surname>Hassaine</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ramakrishnan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Canoy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rahimi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salimi-Khorshidi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>BEHRT: Transformer for electronic health records</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>7155</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-62922-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-62922-y</pub-id>
          <pub-id pub-id-type="medline">32346050</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-62922-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC7189231</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rasmy</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00455-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00455-y</pub-id>
          <pub-id pub-id-type="medline">34017034</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00455-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC8137882</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Speier</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Arnold</surname>
              <given-names>CW</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional representation learning from transformers using multimodal electronic health record data to predict depression</article-title>
          <source>IEEE J Biomed Health Inform</source>
          <year>2021</year>
          <volume>25</volume>
          <issue>8</issue>
          <fpage>3121</fpage>
          <lpage>3129</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33661740"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/JBHI.2021.3063721</pub-id>
          <pub-id pub-id-type="medline">33661740</pub-id>
          <pub-id pub-id-type="pmcid">PMC8606118</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y-P</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>Y-H</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>C-H</given-names>
            </name>
          </person-group>
          <article-title>Disease concept-embedding based on the self-supervised method for medical information extraction from electronic health records and disease retrieval: Algorithm development and validation study</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>e25113</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/1/e25113/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/25113</pub-id>
          <pub-id pub-id-type="medline">33502324</pub-id>
          <pub-id pub-id-type="pii">v23i1e25113</pub-id>
          <pub-id pub-id-type="pmcid">PMC7875703</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wamil</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hassaine</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mamouei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Canoy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nazarzadeh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bidel</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Copland</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rahimi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salimi-Khorshidi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Stratification of diabetes in the context of comorbidities, using representation learning and topological data analysis</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>11478</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-38251-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-38251-1</pub-id>
          <pub-id pub-id-type="medline">37455284</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-38251-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC10350454</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Pre-training of graph augmented transformers for medication recommendation</article-title>
          <source>arXiv:1906.00346</source>
          <year>2019</year>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kalluri</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Spotnitz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Perotte</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>CEHR-BERT: Incorporating temporal information from structured EHR data to improve prediction tasks</article-title>
          <source>Proceedings of Machine Learning for Health, PMLR</source>
          <year>2021</year>
          <volume>158</volume>
          <fpage>239</fpage>
          <lpage>260</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mamouei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salimi-Khorshidi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hassaine</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Canoy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lukasiewicz</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rahimi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Hi-BEHRT: Hierarchical transformer-based model for accurate prediction of clinical events using multimodal longitudinal electronic health records</article-title>
          <source>IEEE J Biomed Health Inform</source>
          <year>2023</year>
          <volume>27</volume>
          <issue>2</issue>
          <fpage>1106</fpage>
          <lpage>1117</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36427286"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/JBHI.2022.3224727</pub-id>
          <pub-id pub-id-type="medline">36427286</pub-id>
          <pub-id pub-id-type="pmcid">PMC7615082</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bui</surname>
              <given-names>AAT</given-names>
            </name>
          </person-group>
          <article-title>AdaDiag: Adversarial domain adaptation of diagnostic prediction with clinical event sequences</article-title>
          <source>J Biomed Inform</source>
          <year>2022</year>
          <volume>134</volume>
          <fpage>104168</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(22)00179-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2022.104168</pub-id>
          <pub-id pub-id-type="medline">35987449</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(22)00179-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9580228</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prakash</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chilukuri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ranade</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Viswanathan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>RareBERT: Transformer architecture for rare disease patient identification using administrative claims</article-title>
          <source>Proc. AAAI Conf. Artif. Intell</source>
          <year>2021</year>
          <volume>35</volume>
          <issue>1</issue>
          <fpage>453</fpage>
          <lpage>460</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v35i1.16122</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Choudhury</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tipirneni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mukherjee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ham</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tamang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kocaman</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Gevaert</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Rallo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <article-title>Preparing for the next pandemic via transfer learning from existing diseases with hierarchical multi-modal BERT: a study on COVID-19 outcome prediction</article-title>
          <source>Sci Rep</source>
          <year>2022</year>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>10748</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-022-13072-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-022-13072-w</pub-id>
          <pub-id pub-id-type="medline">35750878</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-022-13072-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC9232529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>RAPT: Pre-training of time-aware transformer for learning robust healthcare representation</article-title>
          <year>2021</year>
          <conf-name>KDD '21: Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery &amp; Data Mining</conf-name>
          <conf-date>2021 August 14</conf-date>
          <conf-loc>Virtual Event, Singapore</conf-loc>
          <fpage>3503</fpage>
          <lpage>3511</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Generative adversarial networks enhanced pre-training for insufficient electronic health records modeling</article-title>
          <year>2022</year>
          <conf-name>KDD '22: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>2022 August 14</conf-date>
          <conf-loc>Washington DC, USA</conf-loc>
          <fpage>3810</fpage>
          <lpage>3818</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Luan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Language models are unsupervised multitask learners</article-title>
          <source>OpenAI blog</source>
          <year>2019</year>
          <volume>1</volume>
          <issue>8</issue>
          <fpage>9</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <year>2020</year>
          <conf-name>NIPS'20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2020 December 06</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>1877</fpage>
          <lpage>1901</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Steinberg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fleming</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Posada</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lemmon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fries</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>EHR foundation models improve robustness in the presence of temporal distribution shift</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>3767</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-30820-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-30820-8</pub-id>
          <pub-id pub-id-type="medline">36882576</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-30820-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC9992466</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steinberg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fries</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Corbin</surname>
              <given-names>CK</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
          </person-group>
          <article-title>Language models are an effective representation learning technique for electronic health record data</article-title>
          <source>J Biomed Inform</source>
          <year>2021</year>
          <volume>113</volume>
          <fpage>103637</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(20)30265-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103637</pub-id>
          <pub-id pub-id-type="medline">33290879</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30265-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC7863633</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Linwood</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Pretrained transformer framework on pediatric claims data for population specific tasks</article-title>
          <source>Sci Rep</source>
          <year>2022</year>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>3651</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-022-07545-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-022-07545-1</pub-id>
          <pub-id pub-id-type="medline">35256645</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-022-07545-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC8901645</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amirahmadi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ohlsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Etminani</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Melander</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bjork</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A Masked language model for multi-source EHR trajectories contextual representation learning</article-title>
          <source>arXiv:2402.06675</source>
          <year>2024</year>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grill</surname>
              <given-names>J-B</given-names>
            </name>
            <name name-style="western">
              <surname>Strub</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Altché</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tallec</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Richemond</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Buchatskaya</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Doersch</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Pires</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>ZD</given-names>
            </name>
            <name name-style="western">
              <surname>Azar</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Piot</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kavukcuoglu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Munos</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Valko</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bootstrap your own latent a new approach to self-supervised learning</article-title>
          <year>2020</year>
          <conf-name>NIPS'20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2020 December 06</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>21271</fpage>
          <lpage>21284</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vo</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>NTK</given-names>
            </name>
            <name name-style="western">
              <surname>Kha</surname>
              <given-names>QH</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>NQK</given-names>
            </name>
          </person-group>
          <article-title>On the road to explainable AI in drug-drug interactions prediction: a systematic review</article-title>
          <source>Comput Struct Biotechnol J</source>
          <year>2022</year>
          <volume>20</volume>
          <fpage>2112</fpage>
          <lpage>2123</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2001-0370(22)00138-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.csbj.2022.04.021</pub-id>
          <pub-id pub-id-type="medline">35832629</pub-id>
          <pub-id pub-id-type="pii">S2001-0370(22)00138-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC9092071</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doherty</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Algorithms for assessing the probability of an adverse drug reaction</article-title>
          <source>Respiratory Medicine CME</source>
          <year>2009</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>63</fpage>
          <lpage>67</lpage>
          <pub-id pub-id-type="doi">10.1016/j.rmedc.2009.01.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Harthi</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Osman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Sattar</surname>
              <given-names>MAAA</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Dilemmas of the causality assessment tools in the diagnosis of adverse drug reactions</article-title>
          <source>Saudi Pharm J</source>
          <year>2016</year>
          <volume>24</volume>
          <issue>4</issue>
          <fpage>485</fpage>
          <lpage>493</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1319-0164(15)00011-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jsps.2015.01.010</pub-id>
          <pub-id pub-id-type="medline">27330379</pub-id>
          <pub-id pub-id-type="pii">S1319-0164(15)00011-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC4908100</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rizzoli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reginster</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boonen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bréart</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Diez-Perez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Felsenberg</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kanis</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Adverse reactions and drug-drug interactions in the management of women with postmenopausal osteoporosis</article-title>
          <source>Calcif Tissue Int</source>
          <year>2011</year>
          <volume>89</volume>
          <issue>2</issue>
          <fpage>91</fpage>
          <lpage>104</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21637997"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00223-011-9499-8</pub-id>
          <pub-id pub-id-type="medline">21637997</pub-id>
          <pub-id pub-id-type="pmcid">PMC3135835</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Roberta: A robustly optimized bert pretraining approach</article-title>
          <source>arXiv:1907.11692</source>
          <year>2019</year>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gimpel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Soricut</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Albert: A lite bert for self-supervised learning of language representations</article-title>
          <source>arXiv:1909.11942</source>
          <year>2019</year>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Mian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Self-supervised Learning: Generative or Contrastive</article-title>
          <source>IEEE Trans. Knowl. Data Eng</source>
          <year>2021</year>
          <volume>35</volume>
          <issue>01</issue>
          <fpage>857</fpage>
          <lpage>876</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2021.3090866</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kalyan</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Rajasekharan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sangeetha</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>AMMU: A survey of transformer-based biomedical pretrained language models</article-title>
          <source>J Biomed Inform</source>
          <year>2022</year>
          <volume>126</volume>
          <fpage>103982</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(21)00311-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2021.103982</pub-id>
          <pub-id pub-id-type="medline">34974190</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(21)00311-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
          </person-group>
          <article-title>Sequence to sequence learning with neural networks</article-title>
          <year>2014</year>
          <conf-name>NIPS'14: Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2</conf-name>
          <conf-date>2014 December 08</conf-date>
          <conf-loc>Cambridge, MA, United States</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Koo</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Petrov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Grammar as a foreign language</article-title>
          <year>2015</year>
          <conf-name>NIPS'15: Proceedings of the 29th International Conference on Neural Information Processing Systems - Volume 2</conf-name>
          <conf-date>2015 December 07</conf-date>
          <conf-loc>Cambridge, MA, United States</conf-loc>
          <fpage>2773</fpage>
          <lpage>2781</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Noroozi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Favaro</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Leibe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Matas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sebe</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised learning of visual representations by solving jigsaw puzzles</article-title>
          <source>Computer Vision – ECCV 2016. ECCV 2016. Lecture Notes in Computer Science, vol 9910</source>
          <year>2016</year>
          <publisher-loc>Switzerland</publisher-loc>
          <publisher-name>Springer, Cham</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kudlur</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Order matters: sequence to sequence for sets</article-title>
          <source>arXiv:1511.06391</source>
          <year>2015</year>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dui</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Interpretable time-aware and co-occurrence-aware network for medical prediction</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2021</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-021-01662-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-021-01662-z</pub-id>
          <pub-id pub-id-type="medline">34727940</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-021-01662-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC8561378</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bulgarelli</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Horng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-IV</article-title>
          <source>PhysioNet</source>
          <access-date>2021-08-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimiciv/1.0/">https://physionet.org/content/mimiciv/1.0/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berglund</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Elmståhl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Janzon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Larsson</surname>
              <given-names>SA</given-names>
            </name>
          </person-group>
          <article-title>The Malmö Diet and Cancer Study. Design and feasibility</article-title>
          <source>J Intern Med</source>
          <year>1993</year>
          <volume>233</volume>
          <issue>1</issue>
          <fpage>45</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2796.1993.tb00647.x</pub-id>
          <pub-id pub-id-type="medline">8429286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gentimis</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Alnaser</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Durante</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Steele</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Predicting hospital length of stay using neural networks on mimic iii data</article-title>
          <year>2017</year>
          <conf-name>IEEE 15th Intl Conf on Dependable, Autonomic and Secure Computing, 15th Intl Conf on Pervasive Intelligence and Computing, 3rd Intl Conf on Big Data Intelligence and Computing and Cyber Science and Technology Congress(DASC/PiCom/DataCom/CyberSciTech)</conf-name>
          <conf-date>2017 November 06-10</conf-date>
          <conf-loc>Orlando, FL, USA</conf-loc>
          <fpage>1194</fpage>
          <lpage>1201</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farajtabar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Azizan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mott</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Orthogonal gradient descent for continual learning</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics, PMLR</conf-name>
          <conf-date>2020 JAN 01</conf-date>
          <conf-loc>Palermo, Italy</conf-loc>
          <fpage>3762</fpage>
          <lpage>3773</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mirzadeh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Farajtabar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gorur</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pascanu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ghasemzadeh</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Linear mode connectivity in multitask and continual learning</article-title>
          <source>arXiv:2010.04495</source>
          <year>2020</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2010.04495</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Piao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Beit: Bert pre-training of image transformers</article-title>
          <source>arXiv:2106.08254</source>
          <year>2021</year>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>CLIP itself is a strong fine-tuner: Achieving 85.7% and 88.0% top-1 accuracy with ViT-B and ViT-L on ImageNet</article-title>
          <source>arXiv:2212.06138</source>
          <year>2022</year>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sangineto</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sebe</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lepri</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Nadai</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Efficient training of visual transformers with small datasets</article-title>
          <source>arXiv:2106.03746</source>
          <year>2021</year>
          <lpage>30</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Y-H</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Training vision transformers with only 2040 images</article-title>
          <year>2022</year>
          <conf-name>Computer Vision – ECCV 2022: 17th European Conference, Tel Aviv, Israel</conf-name>
          <conf-date>2022 October 23</conf-date>
          <conf-loc>Berlin, Heidelberg</conf-loc>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>220</fpage>
          <lpage>237</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>M-T</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Electra: Pre-training text encoders as discriminators rather than generators</article-title>
          <source>arXiv:2003.10555</source>
          <year>2020</year>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gulati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>C-C</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Conformer: Convolution-augmented transformer for speech recognition</article-title>
          <source>arXiv:2005.08100</source>
          <year>2020</year>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zaheer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Guruganesh</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dubey</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ainslie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Alberti</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ontanon</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Big bird: Transformers for longer sequences</article-title>
          <year>2020</year>
          <conf-name>NIPS'20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2020 December 06</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>17283</fpage>
          <lpage>17297</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Informer: Beyond efficient transformer for long sequence time-series forecasting</article-title>
          <source>Proc. AAAI Conf. Artif. Intell</source>
          <year>2021</year>
          <volume>35</volume>
          <issue>12</issue>
          <fpage>11106</fpage>
          <lpage>11115</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schick</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>It's not just size that matters: small language models are also few-shot learners</article-title>
          <source>arXiv:2009.07118</source>
          <year>2020</year>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundström</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Järpe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Verikas</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Detecting and exploring deviating behaviour of smart home residents</article-title>
          <source>Expert Syst. Appl</source>
          <year>2016</year>
          <volume>55</volume>
          <fpage>429</fpage>
          <lpage>440</lpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2016.02.030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt-Thieme</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Sparse self-attention guided generative adversarial networks for time-series generation</article-title>
          <source>Int J Data Sci Anal</source>
          <year>2023</year>
          <volume>16</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1007/s41060-023-00416-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Phung</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Venkatesh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Predicting healthcare trajectories from medical records: A deep learning approach</article-title>
          <source>J Biomed Inform</source>
          <year>2017</year>
          <volume>69</volume>
          <fpage>218</fpage>
          <lpage>229</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(17)30071-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2017.04.001</pub-id>
          <pub-id pub-id-type="medline">28410981</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(17)30071-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Men</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ilk</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Multi-disease prediction using LSTM recurrent neural networks</article-title>
          <source>Expert Syst. Appl</source>
          <year>2021</year>
          <volume>177</volume>
          <fpage>114905</fpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2021.114905</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amirahmadii</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>ali-amirahmadii</source>
          <access-date>2025-05-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ali-amirahmadii/">https://github.com/ali-amirahmadii/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
          <source>PhysioNet repository</source>
          <access-date>2025-05-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimiciv/">https://physionet.org/content/mimiciv/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="web">
          <source>Malmo population-based Cohorts joint database</source>
          <access-date>2025-05-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.malmo-kohorter.lu.se/malmo-cohorts">https://www.malmo-kohorter.lu.se/malmo-cohorts</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
