<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e68830</article-id><article-id pub-id-type="doi">10.2196/68830</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Autoencoder-Based Representation Learning for Similar Patients Retrieval From Electronic Health Records: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Deyi</given-names></name><degrees>BS, MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shukla</surname><given-names>Aditi</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chandaka</surname><given-names>Sravani</given-names></name><degrees>BS, MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Taylor</surname><given-names>Bradley</given-names></name><degrees>BS, MBA</degrees><xref ref-type="aff" 
rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Jie</given-names></name><degrees>BS, MS, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Liu</surname><given-names>Mei</given-names></name><degrees>BS, MS, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Health Outcomes &#x0026; Biomedical Informatics, University of Florida</institution><addr-line>1889 Museum Rd, 7th Floor, Suite 7000, Room 7012</addr-line><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Mathematics, College of Arts and Sciences, University of Pennsylvania</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Population Health, University of Kansas Medical Center</institution><addr-line>Kansas City</addr-line><addr-line>KS</addr-line><country>United States</country></aff><aff id="aff4"><institution>CTSI Center for Biomedical Informatics, Medical College of Wisconsin</institution><addr-line>Milwaukee</addr-line><addr-line>WI</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Chen</surname><given-names>Qingyu</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chan</surname><given-names>Tsai Hor</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Trivedi</surname><given-names>Yogesh</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Chu</surname><given-names>Yuanchia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mei Liu, BS, MS, PhD, Department of Health Outcomes &#x0026; Biomedical Informatics, University of Florida, 1889 Museum Rd, 7th Floor, Suite 7000, Room 7012, Gainesville, FL, 32611, United States, 1 352-627-9143; <email>mei.liu@ufl.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>24</day><month>7</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e68830</elocation-id><history><date date-type="received"><day>15</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>23</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>04</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9;Deyi Li, Aditi Shukla, Sravani Chandaka, Bradley Taylor, Jie Xu, Mei Liu. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 24.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e68830"/><abstract><sec><title>Background</title><p>By analyzing electronic health record snapshots of similar patients, physicians can proactively predict disease onsets, customize treatment plans, and anticipate patient-specific trajectories. However, the modeling of electronic health record data is inherently challenging due to its high dimensionality, mixed feature types, noise, bias, and sparsity. Patient representation learning using autoencoders (AEs) presents promising opportunities to address these challenges. A critical question remains: how do different AE designs and distance measures impact the quality of retrieved similar patient cohorts?</p></sec><sec><title>Objective</title><p>This study aims to evaluate the performance of 5 common AE variants&#x2014;vanilla autoencoder, denoising autoencoder, contractive autoencoder, sparse autoencoder, and robust autoencoder&#x2014;in retrieving similar patients. Additionally, it investigates the impact of different distance measures and hyperparameter configurations on model performance.</p></sec><sec sec-type="methods"><title>Methods</title><p>We tested the 5 AE variants on 2 real-world datasets&#x2014;the University of Kansas Medical Center (n=13,752) and the Medical College of Wisconsin (n=9568)&#x2014;across 168 different hyperparameter configurations. To retrieve similar patients based on the AE-produced latent representations, we applied k-nearest neighbors (k-NN) using Euclidean and Mahalanobis distances. 
Two prediction targets were evaluated: acute kidney injury onset and postdischarge 1-year mortality.</p></sec><sec sec-type="results"><title>Results</title><p>Our findings demonstrate that (1) denoising autoencoders outperformed other AE variants when paired with Euclidean distance (<italic>P</italic>&#x003C;.001), followed by vanilla autoencoders and contractive autoencoders; (2) learning rates significantly influenced the performance of AE variants; and (3) Mahalanobis distance-based k-NN frequently outperformed Euclidean distance-based k-NN when applied to latent representations. However, whether AE models are superior in transforming raw data into latent representations, compared with applying Mahalanobis distance-based k-NN directly to raw data, appears to be data-dependent.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study provides a comprehensive analysis of the performance of different AE variants in retrieving similar patients and evaluates the impact of various hyperparameter configurations on model performance. The findings lay the groundwork for future development of AE-based patient similarity estimation and personalized medicine.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>decision support for health professionals</kwd><kwd>methods and instruments in medical informatics</kwd><kwd>electronic health records</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Diseases vary in complexity, posing substantial challenges in diagnosis, treatment, and prognosis&#x2014;even when cases appear clinically similar [<xref ref-type="bibr" rid="ref1">1</xref>]. 
This heterogeneity is particularly prominent in complex disorders like autoimmune diseases [<xref ref-type="bibr" rid="ref2">2</xref>], Parkinson disease [<xref ref-type="bibr" rid="ref3">3</xref>], and cardiovascular diseases [<xref ref-type="bibr" rid="ref4">4</xref>], where underlying causes often result from a confluence of genetic, environmental, and lifestyle factors [<xref ref-type="bibr" rid="ref5">5</xref>]. As these complexities become more evident, the rapid adoption of electronic health record (EHR) systems has bolstered the potential for personalized medicine to enhance patient care. Personalized medicine focuses on tailoring treatments and predicting patient outcomes by analyzing data from patients with similar characteristics [<xref ref-type="bibr" rid="ref6">6</xref>]. By assessing EHR snapshots of comparable patients&#x2014;including prescriptions, procedures, vital signs, lab results, and clinical outcomes&#x2014;physicians can proactively predict disease onsets, customize treatment plans, and anticipate patient-specific trajectories [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Additionally, predictive models that leverage data from similar patients tend to be more accurate, as they capture localized data patterns that might be obscured in aggregated data [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Retrieving a high-quality set of similar patients is central to personalized medicine, directly impacting both evidence-based decision-making and the accuracy of personalized predictive models. However, EHR data are inherently challenging to model due to high dimensionality, mixed feature types, noise, bias, and sparsity, complicating the effective retrieval of similar patients [<xref ref-type="bibr" rid="ref10">10</xref>]. For instance, applying traditional Euclidean distance-based k-nearest neighbors (k-NN) directly to EHR data may be problematic due to high dimensionality and mixed data types. 
To address these challenges, various similar patient retrieval algorithms have been proposed, incorporating advanced feature engineering to handle mixed features and reduce dimensionality [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>Patient representation learning offers new avenues for overcoming these obstacles, with autoencoders (AEs) being one of the most important and widely used methods in this area [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. AEs compress input data into a lower-dimensional latent space, known as a latent representation, and reconstruct it back to its original form, facilitating effective auto-feature engineering and patient representation [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. AEs are particularly useful for encoding nonlinear relationships within EHR, and capturing complex structures in clinical data [<xref ref-type="bibr" rid="ref20">20</xref>]. As AE applications to EHR increase, their use is becoming increasingly diverse [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. For instance, Chowdhury et al [<xref ref-type="bibr" rid="ref16">16</xref>] designed a mixed pooling multi-view attention AE to learn representations that encapsulate a holistic view of patient medical profiles. Beaulieu-Jones et al [<xref ref-type="bibr" rid="ref15">15</xref>] applied a vanilla AE with a modified binary cross-entropy loss to impute missing data in EHR, and Lee et al [<xref ref-type="bibr" rid="ref14">14</xref>] used a dual adversarial AE to generate sequential EHR data.</p><p>In personalized medicine, AEs are increasingly applied to enhance similar patient retrieval [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. 
Generally, these studies use AEs to generate efficient patient representations from EHR data, with similarity among patients assessed using distance measures such as Euclidean and Mahalanobis distances. For example, Jo et al [<xref ref-type="bibr" rid="ref21">21</xref>] used a supervised AE to incorporate disease labels into latent representations and calculated patient similarity in the latent space using the Euclidean distance. Miotto et al [<xref ref-type="bibr" rid="ref22">22</xref>] introduced the &#x201C;Deep Patient&#x201D; framework with a 3-layer stack of denoising autoencoders (DAEs) to generate latent patient representations from EHR data, which was then used to estimate patient similarity. Landi et al [<xref ref-type="bibr" rid="ref23">23</xref>] used a convolutional AE to transform patient trajectories into low-dimensional latent vectors and achieved patient risk stratification by patient similarity. These studies underscore the potential of AEs to drive advances in personalized medicine.</p><p>Despite the promising results of applying AEs to EHR data, a critical question remains unanswered: how do different AE designs impact performance in similar patient retrieval tasks? Existing studies have not clearly justified their choices of specific AE designs. Specifically, AE designs encompass 2 key aspects. The first aspect is the choice of the base AE model, as different AE variants may perform differently due to their distinct design focuses and the unique characteristics of EHR data. When these base AE models are integrated into more complex architectures (eg, &#x201C;Deep Patient&#x201D; [<xref ref-type="bibr" rid="ref22">22</xref>]), their behavior may also vary. Therefore, gaining deeper insight into the performance of different base AE models on EHR data is valuable. 
The second aspect is hyperparameter tuning, as AEs are known to be highly sensitive to hyperparameters, including learning rate, latent dimensionality, and optimization techniques [<xref ref-type="bibr" rid="ref24">24</xref>]. Therefore, understanding how different hyperparameters impact AE performance on EHR data is also important.</p><p>In this study, we used 2 real-world EHR datasets from the University of Kansas Medical Center (KUMC) and the Medical College of Wisconsin (MCW), covering the period from January 1, 2016, to December 31, 2016, to evaluate the performance of 5 widely used AE variants for retrieving similar patients: vanilla AE (AE) [<xref ref-type="bibr" rid="ref18">18</xref>], DAE [<xref ref-type="bibr" rid="ref25">25</xref>], contractive autoencoder (CAE) [<xref ref-type="bibr" rid="ref26">26</xref>], sparse autoencoder (SAE) [<xref ref-type="bibr" rid="ref27">27</xref>], and robust autoencoder (RAE) [<xref ref-type="bibr" rid="ref28">28</xref>]. Vanilla AE is the most basic form of autoencoder, making it efficient to train and use. DAE and RAE can address the significant noise in EHR data, while CAE and SAE use different mechanisms to learn more robust latent representations for EHR data with complex distributions [<xref ref-type="bibr" rid="ref29">29</xref>]. Additionally, we investigated the impact of 2 distance measures, Euclidean and Mahalanobis, on similar patient retrieval when paired with these AE variants. To comprehensively evaluate model performance, we tested them within a standard k-NN classification framework for 2 binary clinical outcomes in hospitalized patients: acute kidney injury (AKI) onset and 1-year mortality postdischarge, representing short-term disease and long-term survival risk classification scenarios. AKI, a life-threatening and heterogeneous condition prevalent among hospitalized patients, is particularly suited to a personalized approach. 
Finally, we explored how different hyperparameter configurations affect AE performance in retrieving similar patients for outcome prediction. This study provides key insights into AE optimization for personalized medicine applications, informing future advancements in EHR-driven patient care.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Comparison Framework Overview</title><p>This study aims to evaluate the effects of various AE variants, hyperparameter settings, and distance measures on the performance of similar patient retrieval. The 5 AE variants investigated were vanilla AE (AE), DAE, CAE, SAE, and RAE. Each AE was trained in an unsupervised manner on the training dataset, after which both the training and test datasets were transformed into latent representations using the trained AEs.</p><p>Performance was evaluated using a standard k-NN classification framework with neighborhood sizes of 5, 10, 15, and 20. For similar patient retrieval, Euclidean and Mahalanobis distances were applied to the latent representations to identify similar patients based on a specified neighborhood size for each test patient. Labels were assigned to each test patient through majority voting, and these assigned labels were then compared with the ground truth to assess model accuracy. Furthermore, we analyzed the influence of different hyperparameter configurations on AE model performance in retrieving similar patients, focusing on Euclidean distance as the patient similarity measure.</p></sec><sec id="s2-2"><title>Data Source and Processing</title><p>Our primary dataset consisted of inpatient data extracted from KUMC, covering admissions from January 1, 2016, to December 31, 2016. To assess the generalizability of our findings, we extracted an external validation dataset from MCW for the same period.</p><p>Both datasets were processed using the same protocol. 
The inclusion criteria were as follows: (1) older than 18 years, (2) baseline serum creatinine (SCr) &#x003C;3.5 mg/dL, and (3) AKI onset occurring at least 72 hours postadmission to focus only on hospital-acquired AKI [<xref ref-type="bibr" rid="ref30">30</xref>]. AKI was defined using the SCr criteria described in the &#x201C;Kidney Disease: Improving Global Outcomes&#x201D; clinical practice guidelines [<xref ref-type="bibr" rid="ref31">31</xref>]. For patients with multiple admissions, only the first encounter was retained. The study focused on 3 types of in-hospital clinical features: medications, procedures, and lab test results. The data observation window for these features extended from 48 hours before the prediction point up to the prediction point. For patients with AKI, the prediction point was set at 24 hours before AKI onset, while for patients without AKI, it was set at 24 hours before the last SCr measurement [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Medications were represented by the maximum dosages recorded within the data observation window, procedures were encoded as binary values indicating whether a procedure was performed during the observation window, and lab test results were recorded as the most recent values within the observation window. Medications and procedures present in less than 1% of patients were excluded from the analysis. Lab tests with a missing rate over 30% were also discarded, with the remaining missing lab values imputed using the multiple imputation by chained equations method [<xref ref-type="bibr" rid="ref32">32</xref>]. 
Outliers were replaced using the Winsorizing method with a 1% threshold [<xref ref-type="bibr" rid="ref33">33</xref>], and min-max normalization was applied to scale values between 0 and 1.</p><p>In addition to AKI onset, 1-year mortality after discharge was also included as a prediction target, providing a comprehensive evaluation of the retrieved similar patient cohorts in terms of both short-term (AKI onset) and long-term (1-year mortality) clinical outcomes.</p></sec><sec id="s2-3"><title>AE Variants</title><p>The vanilla AE (<xref ref-type="fig" rid="figure1">Figure 1A</xref>) is the most basic form in the autoencoder family. Its architecture is a symmetric feedforward neural network structure, though this symmetry does not necessarily apply to the weights, biases, or activation functions. It has 2 main components: an encoder and a decoder. The encoder encodes the input into a latent representation, while the decoder reconstructs the original data from this hidden representation. For an AE with a single hidden layer, the input data undergo the following transformations:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mi>Z</mml:mi><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x03D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>b</mml:mi><mml:mo>)</mml:mo></mml:math></disp-formula><disp-formula id="equWL2"><mml:math id="eqn2"><mml:msup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>g</mml:mi><mml:mfenced 
separators="|"><mml:mrow><mml:mi>Z</mml:mi></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x03D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:msup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup><mml:mi>Z</mml:mi><mml:mo>+</mml:mo><mml:mi>b</mml:mi><mml:mi>`</mml:mi><mml:mo>)</mml:mo></mml:math></disp-formula><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the 5 autoencoder (AE) variant designs with different loss functions.<bold> <named-content content-type="#000000">(</named-content><named-content content-type="#000000">A</named-content><named-content content-type="#000000">)</named-content> </bold><named-content content-type="#000000">Vanilla AE;</named-content><bold> <named-content content-type="#000000">(</named-content><named-content content-type="#000000">B</named-content><named-content content-type="#000000">)</named-content> </bold><named-content content-type="#000000">Contractive AE;</named-content><bold> <named-content content-type="#000000">(</named-content><named-content content-type="#000000">C</named-content><named-content content-type="#000000">) </named-content></bold><named-content content-type="#000000">Denoising AE;</named-content><bold> <named-content content-type="#000000">(</named-content><named-content content-type="#000000">D</named-content><named-content content-type="#000000">) </named-content></bold><named-content content-type="#000000">Sparse AE; and</named-content><bold> <named-content content-type="#000000">(</named-content><named-content content-type="#000000"><bold>E</bold></named-content><named-content content-type="#000000">) </named-content></bold>Robust AE. 
MCC: maximum correntropy criterion; MSE: mean squared error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68830_fig01.png"/></fig><p>Here, <inline-formula><mml:math id="ieqn1"><mml:mi>X</mml:mi></mml:math></inline-formula> denotes the input data, and <inline-formula><mml:math id="ieqn2"><mml:mi>X</mml:mi><mml:mi>&#x2032;</mml:mi></mml:math></inline-formula> represents the reconstructed data. <inline-formula><mml:math id="ieqn3"><mml:mi>Z</mml:mi></mml:math></inline-formula> is the output of the latent representation produced by the encoder. <inline-formula><mml:math id="ieqn4"><mml:mi>W</mml:mi></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn5"><mml:mi>W</mml:mi><mml:mi>&#x2032;</mml:mi></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn6"><mml:mi>b</mml:mi></mml:math></inline-formula>, and <inline-formula><mml:math id="ieqn7"><mml:mi>b</mml:mi><mml:mi>&#x2032;</mml:mi></mml:math></inline-formula> are the weights and biases of the encoder and decoder, while <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>&#x03D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mrow><mml:mi>&#x03D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> are activation functions. 
To quantify reconstruction accuracy, the mean squared error (MSE) loss was used, which measures the difference between the original input data and the reconstructed data, as follows:</p><disp-formula id="equWL3"><mml:math id="eqn3"><mml:msub><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>X</mml:mi><mml:mi>`</mml:mi><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi><mml:mi>`</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:math></disp-formula><p>Here, <inline-formula><mml:math id="ieqn10"><mml:mi>n</mml:mi></mml:math></inline-formula> denotes the number of input samples, while <inline-formula><mml:math id="ieqn11"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn12"><mml:msub><mml:mrow><mml:mi>x</mml:mi><mml:mi>`</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represent the <inline-formula><mml:math id="ieqn13"><mml:mi>i</mml:mi></mml:math></inline-formula>-th sample of <inline-formula><mml:math id="ieqn14"><mml:mi>X</mml:mi></mml:math></inline-formula> 
and the <inline-formula><mml:math id="ieqn15"><mml:mi>i</mml:mi></mml:math></inline-formula>-th sample of <inline-formula><mml:math id="ieqn16"><mml:mi>X</mml:mi><mml:mi>&#x2032;</mml:mi></mml:math></inline-formula>, respectively. This loss function allows the AE to learn a compressed representation of the data by minimizing the reconstruction error.</p><p>The DAE (<xref ref-type="fig" rid="figure1">Figure 1C</xref>) enhances model robustness by introducing noise into the input data. Specifically, data with noise (<inline-formula><mml:math id="ieqn17"><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> in <xref ref-type="fig" rid="figure1">Figure 1C</xref>) are used as input, and the reconstruction error is calculated between the original noise-free data (<inline-formula><mml:math id="ieqn18"><mml:mi>X</mml:mi></mml:math></inline-formula> in <xref ref-type="fig" rid="figure1">Figure 1C</xref>) and the reconstructed data. In this way, the model learns to reconstruct the input by eliminating any noise present. In this study, we used swap noise, where each value in the training data may be replaced with a random value from the same column with a certain probability. We selected swap noise for fairness purposes as it has been shown to be the most effective noise type for tabular data [<xref ref-type="bibr" rid="ref34">34</xref>]. Other commonly used noise types include Gaussian noise and masking noise [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>The CAE (<xref ref-type="fig" rid="figure1">Figure 1B</xref>) enhances model robustness by reducing the encoder&#x2019;s sensitivity to minor perturbations in the input, a typical vulnerability in AEs where small variations can lead to significant differences in latent representations. 
The CAE addresses this issue by introducing an additional penalty term, the Frobenius norm of the encoder, to the loss function. This term, which is the <inline-formula><mml:math id="ieqn19"><mml:mi>L</mml:mi><mml:mn>2</mml:mn></mml:math></inline-formula>-norm of the Jacobian matrix of the hidden layer, makes the encoder output more stable against small input variations. The Frobenius norm of the encoder and the CAE loss function are shown as follows:</p><disp-formula id="equWL4"><mml:math id="eqn4"><mml:msubsup><mml:mrow><mml:mfenced open="&#x2016;" close="&#x2016;" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>Z</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo 
stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mo>&#x2202;</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msubsup><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mo>&#x2202;</mml:mo><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula><disp-formula id="equWL5"><mml:math id="eqn5"><mml:msub><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x03BB;</mml:mi><mml:mo>&#x2219;</mml:mo><mml:mfenced open="&#x2016;" close="&#x2016;" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>Z</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></disp-formula><p>Here, <inline-formula><mml:math 
id="ieqn20"><mml:mi>m</mml:mi></mml:math></inline-formula> denotes the number of input features, and <inline-formula><mml:math id="ieqn21"><mml:mi>l</mml:mi></mml:math></inline-formula> represents the length of the latent representations. <inline-formula><mml:math id="ieqn22"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> controls the strength of the additional penalty term, which aims to restrict the rate of change in the encoder output relative to changes in the input. When the input undergoes minor variations, the CAE encoder output remains relatively stable, enhancing the model&#x2019;s robustness against noise and small perturbations.</p><p>The SAE (<xref ref-type="fig" rid="figure1">Figure 1D</xref>) encourages the model to learn efficient representations by enforcing sparsity in latent representations. By adding a Kullback-Leibler (KL) divergence penalty between a Bernoulli distribution and the distribution of latent layer outputs to the loss function, the SAE limits the number of active neurons (whose outputs are significantly nonzero) in the latent layer. This helps SAE to capture key information from the input using a limited number of active neurons in the latent layer, preventing it from simply copying the input to the output and enhancing the model&#x2019;s ability to capture the inherent structure of the input data. 
The additional penalty term and the loss function of the SAE are as follows:</p><disp-formula id="equWL6"><mml:math id="eqn6"><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>&#x2228;</mml:mo><mml:mfenced open="|" separators="|"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mfrac><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>)</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mfrac><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mover accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>)</mml:mo></mml:mrow></mml:mfrac></mml:math></disp-formula><disp-formula id="equWL7"><mml:math id="eqn7"><mml:msub><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mfenced 
separators="|"><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo>&#x2219;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>&#x2228;</mml:mo><mml:mfenced open="|" separators="|"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow></mml:mfenced></mml:math></disp-formula><p>Here, <inline-formula><mml:math id="ieqn23"><mml:mi>&#x03C1;</mml:mi></mml:math></inline-formula> denotes the mean of the Bernoulli distribution. <inline-formula><mml:math id="ieqn24"><mml:mover accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> denotes the mean of the distribution of latent representations over the training data. <inline-formula><mml:math id="ieqn25"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> controls the strength of the additional penalty term.</p><p>The RAE (<xref ref-type="fig" rid="figure1">Figure 1E</xref>) improves noise tolerance by using the maximum correntropy criterion (MCC) instead of MSE for reconstruction error, making it less sensitive to outliers [<xref ref-type="bibr" rid="ref36">36</xref>]. 
The intuition behind the MCC-based reconstruction error is that as the distance between <inline-formula><mml:math id="ieqn26"><mml:mi>X</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn27"><mml:mi>X</mml:mi><mml:mi>`</mml:mi></mml:math></inline-formula> increases, the corresponding measure transitions from the <inline-formula><mml:math id="ieqn28"><mml:mi>L</mml:mi><mml:mn>2</mml:mn></mml:math></inline-formula> norm to the <inline-formula><mml:math id="ieqn29"><mml:mi>L</mml:mi><mml:mn>1</mml:mn></mml:math></inline-formula> norm, and eventually to the zero norm when <inline-formula><mml:math id="ieqn30"><mml:mi>X</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn31"><mml:mi>X</mml:mi><mml:mi>`</mml:mi></mml:math></inline-formula> are far apart. The RAE also includes a sparsity penalty term similar to that of the SAE, along with an additional weight decay term to prevent overfitting. The MCC-based reconstruction error, the weight decay term, and the final loss function of the RAE are shown as follows:</p><disp-formula id="equWL8"><mml:math id="eqn8"><mml:mi>M</mml:mi><mml:mi>C</mml:mi><mml:mi>C</mml:mi><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>X</mml:mi><mml:mi>`</mml:mi><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo 
stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msqrt><mml:mn>2</mml:mn><mml:mi>&#x03C0;</mml:mi></mml:msqrt><mml:mi>&#x03C3;</mml:mi></mml:mrow></mml:mfrac><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mrow><mml:mi>x</mml:mi><mml:mi>`</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:msup><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula><disp-formula id="equWL9"><mml:math id="eqn9"><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo 
stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula><disp-formula id="equWL10"><mml:math id="eqn10"><mml:msub><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>R</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>M</mml:mi><mml:mi>C</mml:mi><mml:mi>C</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>&#x03B2;</mml:mi><mml:mo>&#x2219;</mml:mo><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>&#x2228;</mml:mo><mml:mfenced open="|" separators="|"><mml:mrow><mml:mover 
accent="true"><mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:mo>&#x2219;</mml:mo><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:mfenced></mml:math></disp-formula><p>Here, <inline-formula><mml:math id="ieqn32"><mml:mi>&#x03C3;</mml:mi></mml:math></inline-formula> denotes the standard deviation of the Gaussian kernel. <inline-formula><mml:math id="ieqn33"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula> denotes an element in the weight matrix of the <inline-formula><mml:math id="ieqn34"><mml:mi>L</mml:mi></mml:math></inline-formula>-th layer. <inline-formula><mml:math id="ieqn35"><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the number of neurons in the <inline-formula><mml:math id="ieqn36"><mml:mi>L</mml:mi></mml:math></inline-formula>-th layer. <inline-formula><mml:math id="ieqn37"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn38"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> control the strength of the 2 penalty terms, respectively.</p></sec><sec id="s2-4"><title>Patient Similarity Measures</title><p>To identify similar patients, we applied 2 distance measures&#x2014;Euclidean and Mahalanobis distances&#x2014;to the latent representations generated by each AE variant. For each patient in the test dataset, we used these distance measures to find a cohort of similar patients from the training dataset. 
The Euclidean distance on the latent representations is calculated as follows:</p><disp-formula id="equWL11"><mml:math id="eqn11"><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>E</mml:mi><mml:mi>u</mml:mi><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:msqrt></mml:math></disp-formula><p>Here, <inline-formula><mml:math id="ieqn39"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the <inline-formula><mml:math id="ieqn40"><mml:mi>i</mml:mi></mml:math></inline-formula>-th patient in the training set and <inline-formula><mml:math id="ieqn41"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the <inline-formula><mml:math id="ieqn42"><mml:mi>j</mml:mi></mml:math></inline-formula>-th patient in the test set. 
<inline-formula><mml:math id="ieqn43"><mml:mi>l</mml:mi></mml:math></inline-formula> denotes the length of the latent representations.</p><p>The Mahalanobis distance can be viewed as a Euclidean distance after applying a linear transformation to the feature space, defined by <inline-formula><mml:math id="ieqn44"><mml:mi>L</mml:mi></mml:math></inline-formula>:</p><disp-formula id="equWL12"><mml:math id="eqn12"><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>h</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:msqrt><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>L</mml:mi><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>L</mml:mi><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:msqrt></mml:math></disp-formula><p>In this study, we used 3 different algorithms to estimate the Mahalanobis distance on the latent representations: large margin nearest neighbor (LMNN) [<xref ref-type="bibr" rid="ref37">37</xref>], Neighborhood Components Analysis (NCA) [<xref ref-type="bibr" rid="ref38">38</xref>], and Metric Learning for Kernel 
Regression (MLKR) [<xref ref-type="bibr" rid="ref39">39</xref>].</p><list list-type="bullet"><list-item><p>LMNN learns a Mahalanobis distance within the standard k-NN classification framework, aiming to bring the nearest <italic>k</italic> neighbors from the same class closer while ensuring that examples from different classes are separated by a large margin.</p></list-item><list-item><p>NCA enhances the accuracy of nearest neighbor classification compared with the traditional Euclidean distance by directly maximizing a stochastic version of the leave-one-out k<italic>-</italic>NN score on the training set.</p></list-item><list-item><p>MLKR learns a Mahalanobis distance by directly minimizing the leave-one-out regression error. This algorithm can also be viewed as a supervised extension of principal component analysis (PCA), making it suitable for dimensionality reduction and visualization of high-dimensional data.</p></list-item></list></sec><sec id="s2-5"><title>Experimental Design</title><p>The workflow of the study is presented in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Each AE variant in this study consisted of a 3-layer structure: an input layer, a hidden layer, and an output layer. Previous research suggests that additional hidden layers in AEs do not necessarily lead to improved downstream task performance [<xref ref-type="bibr" rid="ref40">40</xref>]. All AE variants were implemented in PyTorch (version 2.4.0; Meta AI), and trained on 2 NVIDIA GeForce RTX 2080 Ti GPUs, each with 10.7 GB of RAM. For each of the 5 AE variants, we performed an exhaustive grid search to explore all possible combinations of learning rates, optimizers, latent dimensions, and activation functions, resulting in 168 different hyperparameter configurations per AE variant. Details of the hyperparameter space are listed in <xref ref-type="table" rid="table1">Table 1</xref>. 
The Mahalanobis distance was estimated using the metric-learn library [<xref ref-type="bibr" rid="ref41">41</xref>].</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Workflow of the study. After the electronic health record data were transformed into latent representations using 5-fold cross-validation, the latent representations were used for 3 downstream evaluations: Euclidean-distance-based similar patient retrieval, Mahalanobis-distance-based similar patient retrieval, and AE hyperparameter analysis. AE: autoencoder; CAE: contractive autoencoder; DAE: denoising autoencoder; KUMC: University of Kansas Medical Center; LMNN: large margin nearest neighbor; MCW: Medical College of Wisconsin; MLKR: metric learning for kernel regression; NCA: neighborhood components analysis; RAE: robust autoencoder; SAE: sparse autoencoder.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68830_fig02.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Hyperparameter configuration space. 
The combinations of different values for the 4 studied hyperparameters resulted in 168 different hyperparameter configurations.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Hyperparameter</td><td align="left" valign="bottom">Range</td></tr></thead><tbody><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">1E-5, 1E-4, 1E-3, and 1E-2</td></tr><tr><td align="left" valign="top">Optimizer</td><td align="left" valign="top">Adam, Adamax, and RMSprop</td></tr><tr><td align="left" valign="top">Latent dimension : input dimension</td><td align="left" valign="top">0.02, 0.05, 0.10, 0.15, 0.30, 0.50, and 0.75</td></tr><tr><td align="left" valign="top">Activation functions</td><td align="left" valign="top">Sigmoid and rectified linear unit (ReLU)</td></tr></tbody></table></table-wrap><p>To assess the generalizability of our findings, we applied the following steps in parallel to both the KUMC and MCW datasets. Before evaluating the performance of different AE variants on the 4 hyperparameters of interest mentioned above, we fixed each AE variant&#x2019;s unique hyperparameters (eg, the sparsity term penalty strength <inline-formula><mml:math id="ieqn45"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> in the SAE). Otherwise, combining each model&#x2019;s unique hyperparameters with the selected 4 hyperparameters would make the computation infeasible. To determine the optimal unique hyperparameter settings, we fine-tuned each AE variant using a standardized setup (learning rate=1E-3, optimizer=Adam, latent dimension: input dimension=0.15, and activation=Sigmoid) and fixed these optimal hyperparameter values in the subsequent experiments. 
To account for variations arising from the random initialization of neural network weights and data splitting, each of the 168 studied hyperparameter configurations was trained and evaluated using 5-fold cross-validation, with the average performance of the Euclidean distance-based k-NN on the AE-produced latent representations reported for the 2 prediction targets&#x2014;AKI onset and 1-year mortality&#x2014;across the 5 runs. The performance of the Euclidean distance-based k-NN on both the raw data and the data transformed by PCA, retaining 99% of the variance, served as baseline performance. We used  <italic>F</italic><sub>1</sub>-scores, area under the precision-recall curve (AUPRC), and area under the receiver operating characteristic curve (AUROC) as evaluation metrics. The k-NN model was evaluated with neighborhood sizes of 5, 10, 15, and 20, respectively, considering that the size of the retrieved similar patient cohort often varies based on different clinical needs (eg, the varying complexity of different diseases). We trained the models on each of the 5 data splits for up to 2000 epochs using an early stopping mechanism, meaning that training was stopped if the validation loss did not improve for more than 5 consecutive epochs.</p><p>We then evaluated the performance of using Mahalanobis distance as the distance measure for k-NN on latent representations, focusing on two key questions: (1) Does k-NN with Mahalanobis distance on latent representations outperform k-NN with Euclidean distance? (assessing the effectiveness of Mahalanobis distance), and (2) Is applying Mahalanobis distance-based k-NN on latent representations more effective than applying it directly to raw data? 
(assessing the effectiveness of using AEs for EHR data transformation).</p><p>We first selected the best-performing hyperparameter configuration for each AE variant that achieved the highest <italic>F</italic><sub>1</sub>-score using Euclidean distance-based k-NN with a neighborhood size of 5 to transform the raw data into latent representations. Given that we varied latent dimensions during training and considering the potentially significant impact of different latent dimensions on Mahalanobis distance-based k-NN performance, which could obscure the actual characteristics of each AE variant, we also selected the best-performing hyperparameter configuration for each AE variant with latent-to-input dimension ratio fixed at 0.5 to transform the raw data into latent representations. Due to the significant computational cost of estimating the Mahalanobis distances, we randomly sampled 50% of the AE-transformed training and test datasets from each of the 5-fold data splits for evaluation on the KUMC dataset and only evaluated the performance with a neighborhood size of 5. Fixed random seeds were used to ensure the sampled data remained consistent across all AE variant evaluations. For the MCW dataset, we randomly sampled 70% of the data following the same procedure to ensure a comparable sample size to the sampled KUMC dataset.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>One-tailed paired <italic>t</italic> test was used to assess whether one AE variant significantly outperformed the other with Euclidean-based k-NN across the 168 different hyperparameter configurations, with <italic>P</italic>&#x003C;.01 considered statistically significant. 
Considering that in actual practice, neural network models are often fine-tuned to achieve optimal or near-optimal performance, we used an error bar plot to compare the average performance of the top 5 hyperparameter configurations for each AE variant, with Euclidean distance-based k-NN evaluated at neighborhood sizes of 5, 10, 15 and 20. This represents the upper performance bound of each AE variant in retrieving similar patients. We used box plots to visualize the impact of hyperparameter configurations on model performance. Each box plot shows the performance with Euclidean distance, where one hyperparameter of interest was fixed at a specific value while all other hyperparameters varied for each AE model.</p><p>To assess the generalizability of our findings from the KUMC dataset, we applied Spearman rank correlation to evaluate the relationship between model performance on the KUMC dataset and that on the MCW dataset. The Spearman rho (<inline-formula><mml:math id="ieqn46"><mml:mi>&#x03C1;</mml:mi></mml:math></inline-formula>) value was used to measure the strength and direction of the monotonic relationship between performances on the 2 datasets. A higher <inline-formula><mml:math id="ieqn47"><mml:mi>&#x03C1;</mml:mi></mml:math></inline-formula> indicates a stronger correlation, suggesting good generalizability.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>All data were deidentified according to the &#x201C;Safe Harbor&#x201D; criteria outlined in the Health Insurance Portability and Accountability Act. The study was determined to be nonhuman participants research by the University of Florida Institutional Review Board, as it involved only pre-existing, deidentified patient records. The data access request was approved by the Greater Plains Collaborative Data Request Oversight Committee. 
This study was determined by the institutional review boards of the University of Florida, University of Pittsburgh Medical Center, and University of Missouri as nonhuman participant research because it only involved the collection of existing and deidentified patient medical data. Data use agreements have been executed with both the Greater Plains Collaborative and the University of Pittsburgh.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Population</title><p>The final KUMC dataset encompassed 13,752 unique patients, while the MCW dataset encompassed 9568 patients. The AKI onset rates for the 2 datasets were 11.90% and 9.30%, respectively, and the 1-year mortality rates were 12.65% and 15.51%, respectively. The KUMC dataset contained 579 features, including 277 medications, 288 procedures, and 14 lab tests, while the MCW dataset contained 654 features, including 328 medications, 312 procedures, and 14 lab tests. The details of the 2 datasets are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Statistics of the 2 datasets used in the study.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">KUMC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">MCW<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Cohort size</td><td align="left" valign="top">13,752</td><td align="left" valign="top">9568</td></tr><tr><td align="left" valign="top">Time window</td><td align="left" valign="top">January 1, 2016 to December 31, 2016</td><td align="left" valign="top">January 1, 2016 to December 31, 2016</td></tr><tr><td align="left" valign="top">AKI rates, n (%)</td><td align="left" valign="top">1636 (11.90)</td><td align="left" 
valign="top">890 (9.30)</td></tr><tr><td align="left" valign="top">1-year mortality rates, n (%)</td><td align="left" valign="top">1736 (12.65)</td><td align="left" valign="top">1484 (15.51)</td></tr><tr><td align="left" valign="top">Age (years), median (IQR)</td><td align="left" valign="top">61 (48-71)</td><td align="left" valign="top">61 (48-71)</td></tr><tr><td align="left" valign="top">Female, n (%)</td><td align="left" valign="top">6902 (50.19)</td><td align="left" valign="top">4772 (49.87)</td></tr><tr><td align="left" valign="top">Black race, n (%)</td><td align="left" valign="top">1856 (13.50)</td><td align="left" valign="top">2018 (21.09)</td></tr><tr><td align="left" valign="top">Days from admission to AKI<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> onset (days), median (IQR)</td><td align="left" valign="top">7 (4-17)</td><td align="left" valign="top">5 (3-7)</td></tr><tr><td align="left" valign="top">Number of medication features</td><td align="left" valign="top">277</td><td align="left" valign="top">328</td></tr><tr><td align="left" valign="top">Number of procedure features</td><td align="left" valign="top">288</td><td align="left" valign="top">312</td></tr><tr><td align="left" valign="top">Number of lab test features</td><td align="left" valign="top">14</td><td align="left" valign="top">14</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>KUMC: University of Kansas Medical Center.</p></fn><fn id="table2fn2"><p><sup>b</sup>MCW: Medical College of Wisconsin.</p></fn><fn id="table2fn3"><p><sup>c</sup>AKI: acute kidney injury.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>AE Performance With Euclidean Distance</title><p>The fine-tuned and fixed model-specific hyperparameters are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
On the KUMC dataset, DAE consistently performed the best across both prediction targets (ie, AKI onset and 1-year mortality) and all k-NN neighborhood sizes (<italic>P</italic>&#x003C;.001,<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> and <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>), followed by vanilla AE and CAE (<xref ref-type="fig" rid="figure3">Figure 3</xref>, Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). The average performance of the top 5 hyperparameter configurations showed a similar trend, with DAE performing the best, followed by vanilla AE and CAE. For AKI onset prediction, CAE and SAE outperformed baseline models (ie, k-NN applied to the raw data and the PCA-transformed data) at <italic>k</italic>=15 and <italic>k</italic>=20 (<xref ref-type="fig" rid="figure3">Figure 3C</xref>) and performed comparably to the baseline models for 1-year mortality prediction (<xref ref-type="fig" rid="figure3">Figure 3D</xref>). The average best performance of RAE did not surpass that of the baseline models for 1-year mortality prediction (<xref ref-type="fig" rid="figure3">Figure 3D</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p><italic>F</italic><sub>1</sub>-scores of Euclidean-distance-based k-nearest neighbor models on the latent representations on the KUMC dataset. (<bold>A) </bold><italic>F</italic><sub>1</sub>-scores of predicting AKI onset. Each box represents the k-nearest neighbor <italic>F</italic><sub>1</sub>-scores with AE models trained with different hyperparameter configurations. (<bold>B) </bold><italic>F</italic><sub>1</sub>-scores of predicting 1-year mortality. (<bold>C) </bold>The mean <italic>F</italic><sub>1</sub>-scores of the top 5 best AE hyperparameter configurations of predicting AKI onset. 
(<bold>D) </bold>The mean <italic>F</italic><sub>1</sub>-scores of the top 5 best AE hyperparameter configurations of predicting 1-year mortality. AE: autoencoder; AKI: acute kidney injury; CAE: contractive autoencoder; DAE: denoising autoencoder; PCA: principal component analysis; RAE: robust autoencoder; SAE: sparse autoencoder.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68830_fig03.png"/></fig><p>The performance of AE variants on the MCW dataset exhibited a clear resemblance to the results obtained on the KUMC dataset, with DAE consistently outperforming the other models (<italic>P</italic>&#x003C;.001, <xref ref-type="fig" rid="figure4">Figures 4A</xref> and <xref ref-type="fig" rid="figure4">B</xref>). Correlation analysis showed that, across different hyperparameter settings, the <italic>F</italic><sub>1</sub>-scores of AE variants were significantly correlated between both datasets, with &#x03C1;&#x003E;.80 when predicting AKI onset (<xref ref-type="fig" rid="figure4">Figure 4C-F</xref>) and &#x03C1;&#x003E;.89 when predicting 1-year mortality (<xref ref-type="fig" rid="figure4">Figure 4G-J</xref>). Similar results were also observed when using AUPRC and AUROC as metrics (Figures S3 and S4 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p><italic>F</italic><sub>1</sub>-scores of Euclidean-distance-based k-nearest neighbor models on the latent representations on the MCW dataset, and the linear correlation between the <italic>F</italic><sub>1</sub>-scores on the KUMC and that on the MCW datasets. (<bold>A) </bold><italic>F</italic><sub>1</sub>-scores of predicting AKI onset. (<bold>B) </bold><italic>F</italic><sub>1</sub>-scores of predicting 1-year mortality.<bold> (C-F) </bold>Concordance in predicting AKI onset with varying neighborhood sizes. 
(<bold>G-J) </bold>Concordance in predicting 1-year mortality with varying neighborhood sizes. AE: autoencoder; AKI: acute kidney injury; CAE: contractive autoencoder; DAE: denoising autoencoder; KUMC: University of Kansas Medical Center; MCW: Medical College of Wisconsin; RAE: robust autoencoder; SAE: sparse autoencoder.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68830_fig04.png"/></fig></sec><sec id="s3-3"><title>Impact of Hyperparameters on AE Performance</title><p>On the KUMC dataset, when the neighborhood size was 5 and AKI onset was the prediction target, we observed that different hyperparameter settings had varying impacts on model performance. For learning rates, smaller values led to better performance for vanilla AE, DAE, and RAE. For CAE, a moderate learning rate yielded better results. At a higher learning rate (1E-2), the variance in model performance increased, with the upper bound observed in CAE and SAE outperforming those of other learning rates (<xref ref-type="fig" rid="figure5">Figure 5A</xref>). In terms of the optimizer, Adamax resulted in a slightly higher lower bound of model performance, while the upper bound showed no significant differences across optimizers (<xref ref-type="fig" rid="figure5">Figure 5B</xref>). Similarly, for latent dimensionality, higher dimensions (latent dimension: input dimension=0.75) led to a higher lower bound of model performance, with no significant differences observed in the upper bound (<xref ref-type="fig" rid="figure5">Figure 5C</xref>). No significant differences were observed between sigmoid and rectified linear unit activations (<xref ref-type="fig" rid="figure5">Figure 5D</xref>). 
Highly similar hyperparameter trends were observed on the KUMC dataset with neighborhood sizes of 10, 15, and 20 using <italic>F</italic><sub>1</sub>-scores as the metric (Figures S5-S7 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>), and with a neighborhood size of 5 using AUPRC and AUROC as metrics (Figures S8 and S9 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>), as well as on the MCW dataset with a neighborhood size of 5 (Figures S10-S12 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Impact of different AE hyperparameters on k-nearest neighbor model performance for predicting acute kidney injury onset on the University of Kansas Medical Center dataset. Each box plot shows the <italic>F</italic><sub>1</sub>-scores with Euclidean distance and a neighborhood size of 5, when one hyperparameter was fixed, while varying all other hyperparameters for each AE model. (<bold>A) </bold>Learning rates; (<bold>B) </bold>Optimizers; (<bold>C) </bold>The ratio of latent representation dimension to input data dimension; (<bold>D) </bold>Activation functions. AE: autoencoder; CAE: contractive autoencoder; DAE: denoising autoencoder; RAE: robust autoencoder; SAE: sparse autoencoder.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68830_fig05.png"/></fig></sec><sec id="s3-4"><title>AE Performance With Mahalanobis Distance</title><p>When comparing the performance of Euclidean distance-based k-NN and Mahalanobis distance-based k-NN on the KUMC dataset, we found that Mahalanobis distance-based k-NN generally performed better than Euclidean distance, except in a few cases (eg, DAE+NCA in <xref ref-type="table" rid="table3">Table 3</xref>). 
These performance drops primarily occur in NCA and MLKR, while LMNN consistently outperforms the Euclidean distance. This conclusion can be well generalized to the performance when controlling the latent dimension ratio (latent-to-input dimension ratio=0.5, <xref ref-type="table" rid="table4">Table 4</xref>) and the results on the MCW dataset (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref> and <xref ref-type="supplementary-material" rid="app6">6</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p><italic>F</italic><sub>1</sub>-scores of k-nearest neighbors (k-NNs) with Euclidean and Mahalanobis distances on the latent representations produced by the best-performing hyperparameter configuration of each autoencoder (AE) variant on the University of Kansas Medical Center dataset. The values are presented as mean (SD) of the 5-fold cross-validation.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Euclidean distance, mean (SD)</td><td align="left" valign="bottom" colspan="3">Mahalanobis distance, mean (SD)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">LMNN<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">NCA<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">MLKR<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Raw</td><td align="left" valign="top">0.284 (0.032)</td><td align="left" valign="top">0.314 (0.030)</td><td align="left" valign="top">0.325 (0.070)</td><td align="left" valign="top">0.381 (0.023)</td></tr><tr><td align="left" valign="top">AE</td><td align="left" valign="top">0.330 (0.030)</td><td align="left" valign="top">0.345 (0.046)</td><td align="left" 
valign="top">0.336 (0.052)</td><td align="left" valign="top">0.333 (0.029)</td></tr><tr><td align="left" valign="top">DAE<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">0.364 (0.027)</td><td align="left" valign="top">0.385 (0.025)</td><td align="left" valign="top">0.354 (0.033)</td><td align="left" valign="top">0.374 (0.031)</td></tr><tr><td align="left" valign="top">CAE<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">0.338 (0.028)</td><td align="left" valign="top">0.356 (0.035)</td><td align="left" valign="top">0.340 (0.037)</td><td align="left" valign="top">0.371 (0.038)</td></tr><tr><td align="left" valign="top">SAE<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">0.332 (0.032)</td><td align="left" valign="top">0.354 (0.028)</td><td align="left" valign="top">0.323 (0.021)</td><td align="left" valign="top">0.325 (0.034)</td></tr><tr><td align="left" valign="top">RAE<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">0.344 (0.025)</td><td align="left" valign="top">0.412 (0.037)</td><td align="left" valign="top">0.375 (0.029)</td><td align="left" valign="top">0.401 (0.012)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>LMNN: large margin nearest neighbor.</p></fn><fn id="table3fn2"><p><sup>b</sup>NCA: neighborhood components analysis.</p></fn><fn id="table3fn3"><p><sup>c</sup>MLKR: Metric Learning for Kernel Regression.</p></fn><fn id="table3fn4"><p><sup>d</sup>DAE: denoising autoencoder.</p></fn><fn id="table3fn5"><p><sup>e</sup>CAE: contractive autoencoder.</p></fn><fn id="table3fn6"><p><sup>f</sup>SAE: sparse autoencoder.</p></fn><fn id="table3fn7"><p><sup>g</sup>RAE: robust autoencoder.</p></fn></table-wrap-foot></table-wrap><p>When comparing the performance of Mahalanobis distance-based k-NN on latent representations versus directly on raw data, no 
consistent pattern was observed. On the KUMC dataset, in most cases, Mahalanobis distance-based k-NN on latent representations outperformed its performance on the raw data. However, there was no fixed pattern for the optimal combination of AE variants and Mahalanobis distance estimation algorithms. When latent dimensions were not controlled, the combination of RAE and all 3 investigated Mahalanobis distance algorithms achieved the best performance (<xref ref-type="table" rid="table3">Table 3</xref>). In contrast, when latent dimensions were controlled, the combination of DAE with LMNN and NCA and the combination of RAE with MLKR performed the best (<xref ref-type="table" rid="table4">Table 4</xref>). On the MCW dataset, limited cases showed that Mahalanobis distance-based k-NN on latent representations outperformed its application on the raw data, indicating that this pattern is data-dependent (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref> and <xref ref-type="supplementary-material" rid="app6">6</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p><italic>F</italic><sub>1</sub>-scores of k-nearest neighbors (k-NNs) with Euclidean and Mahalanobis distances on the latent representations produced by the best-performing hyperparameter configuration, constrained by a latent-to-input dimension ratio of 0.5 for each autoencoder (AE) variant, on the University of Kansas Medical Center (KUMC) dataset.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Euclidean distance, mean (SD)</td><td align="left" valign="bottom" colspan="3">Mahalanobis distance, mean (SD)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">LMNN<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">NCA<sup><xref ref-type="table-fn" 
rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">MLKR<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Raw</td><td align="left" valign="top">0.284 (0.032)</td><td align="left" valign="top">0.314 (0.030)</td><td align="left" valign="top">0.325 (0.070)</td><td align="left" valign="top">0.381 (0.023)</td></tr><tr><td align="left" valign="top">AE</td><td align="left" valign="top">0.320 (0.027)</td><td align="left" valign="top">0.368 (0.039)</td><td align="left" valign="top">0.330 (0.009)</td><td align="left" valign="top">0.337 (0.025)</td></tr><tr><td align="left" valign="top">DAE<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.353 (0.025)</td><td align="left" valign="top">0.378 (0.032)</td><td align="left" valign="top">0.360 (0.016)</td><td align="left" valign="top">0.348 (0.021)</td></tr><tr><td align="left" valign="top">CAE<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">0.337 (0.016)</td><td align="left" valign="top">0.345 (0.023)</td><td align="left" valign="top">0.342 (0.029)</td><td align="left" valign="top">0.363 (0.028)</td></tr><tr><td align="left" valign="top">SAE<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">0.323 (0.040)</td><td align="left" valign="top">0.357 (0.051)</td><td align="left" valign="top">0.332 (0.039)</td><td align="left" valign="top">0.317 (0.035)</td></tr><tr><td align="left" valign="top">RAE<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">0.291 (0.030)</td><td align="left" valign="top">0.327 (0.048)</td><td align="left" valign="top">0.359 (0.029)</td><td align="left" valign="top">0.396 (0.032)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LMNN: large margin nearest neighbor.</p></fn><fn id="table4fn2"><p><sup>b</sup>NCA: neighborhood 
components analysis.</p></fn><fn id="table4fn3"><p><sup>c</sup>MLKR: Metric Learning for Kernel Regression.</p></fn><fn id="table4fn4"><p><sup>d</sup>DAE: denoising autoencoder.</p></fn><fn id="table4fn5"><p><sup>e</sup>CAE: contractive autoencoder.</p></fn><fn id="table4fn6"><p><sup>f</sup>SAE: sparse autoencoder.</p></fn><fn id="table4fn7"><p><sup>g</sup>RAE: robust autoencoder.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Main Findings</title><p>This study makes significant contributions in three main areas: (1) it is the first to comprehensively evaluate the performance of different AEs specifically for EHR-based similar patient retrieval, providing critical insights to inform the design of AE-based patient representation learning models; (2) it is the first study to apply Mahalanobis distance to patient representations learned by AEs for similar patient retrieval, whereas previous studies have primarily relied on Euclidean distance; and (3) by establishing a fair and comprehensive evaluation framework, this study offers valuable guidance for AE model selection and hyperparameter tuning, contributing to the advancement of patient representation learning in EHR research.</p><p>Our findings indicate that DAE consistently outperformed other AEs, followed by vanilla AE and CAE, with RAE performing the worst. The superior performance of DAE likely stems from its mechanism of introducing noise into the original data during training, which encourages the model to prioritize encoding meaningful latent nonlinear relationships that are important for disease and outcome prediction, rather than focusing on noise. 
This process helps the model remain robust to noise, enabling more refined and abstracted patient representations, which improve the performance of the downstream k-NN model.</p><p>Other AE mechanisms, such as those in CAE and SAE, are also designed to produce effective representations and have performed well in regression and classification tasks in previous studies [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. However, they were less effective than DAE in retrieving similar patients. For SAE, the assumed Bernoulli distribution over the latent representations may mismatch the continuous outputs of the hidden layer, conflicting with the need for sufficient active neurons to preserve key data information. This trade-off between enforcing sparsity and preserving key data information can significantly degrade downstream k-NN performance, especially on complex data. The RAE was originally designed for image classification tasks [<xref ref-type="bibr" rid="ref28">28</xref>] and used an MCC loss rather than MSE to make the model more robust to outliers. However, this approach may not be well-suited to EHR data, which are often high-dimensional, sparse, noisy, and biased, resulting in its underperformance compared with other AE variants. Moreover, one of the prediction targets was AKI onset. Patients at high risk for developing AKI may show significant differences in lab results and medications compared with the general population. However, the MCC loss may weaken the encoding of this information in the latent representation, making the latent representations of high-risk patients with AKI less distinguishable from those in the normal population, which in turn leads to inaccuracies in similar patient retrieval (ie, higher false negative rates).</p><p>While both DAE and RAE are designed to enhance model robustness to noise, their underlying mechanisms differ significantly. 
DAE achieves this by explicitly injecting noise into the input data and training the model to &#x201C;denoise&#x201D; it, thereby encouraging the model to focus on capturing the intrinsic structure of the data. In contrast, RAE aims to improve robustness by making the model less sensitive to the input&#x2019;s tail distribution, which reduces the influence of outliers. However, this approach may prevent the model from fully capturing the true data distribution. This fundamental difference could be a key factor contributing to the performance gap observed between DAE and RAE in the specific scenarios examined in this study.</p><p>Interestingly, while previous research suggests AE models are highly sensitive to hyperparameter configurations [<xref ref-type="bibr" rid="ref24">24</xref>], our findings indicate that for the task of retrieving similar patients, only the learning rate significantly affected model performance. Specifically, smaller learning rates resulted in stronger lower-bound performance, as shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>. However, larger learning rates (eg, 1E-2) could help the model escape local minima in certain situations, although not always guaranteed. Given the longer training times associated with smaller learning rates, we recommend using a moderate learning rate (eg, 1E-3 or 1E-4).</p><p>The impact of latent dimensions on model performance was minimal. Even with smaller latent dimensions (latent dimension to input dimension ratio=0.02), the latent representations remained expressive enough to ensure accurate patient similarity estimation. As shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>, increasing the latent dimensions to accommodate additional information only slightly improved the model&#x2019;s lower-bound performance. 
However, this conclusion is highly dataset-dependent, and other datasets may require larger latent dimensions to capture more information.</p><p>Regarding the application of Mahalanobis distance to latent representations, it outperformed Euclidean distance in most cases. Mahalanobis distance applies a low-dimensional linear transformation to map the representations into a space where the margin between different classes is maximized while representations of the same class are pulled closer together, thereby enhancing the discriminative power of the downstream k-NN model [<xref ref-type="bibr" rid="ref41">41</xref>]. This result was expected, as Mahalanobis distance estimation treats latent representations as a separate dataset for learning appropriate linear feature transformations and distance measures. Compared with Euclidean distance, Mahalanobis distance is more likely to provide a more accurate estimation of vector similarity both within and between classes.</p><p>However, comparisons between Mahalanobis distance estimation algorithms on raw data and latent representations revealed that AE transformations of raw data did not always guarantee better performance. For example, AEs significantly improved the performance of Mahalanobis distance-based k-NN on the KUMC dataset (<xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>), but this improvement was limited on the MCW dataset (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref> and <xref ref-type="supplementary-material" rid="app6">6</xref>). There can be several reasons behind this. First, differences in the characteristics of the datasets could play a major role. Variations in feature distributions, data sparsity, or sample size between the KUMC and MCW datasets may affect the effectiveness of AEs in learning meaningful latent representations. 
If the KUMC dataset contains clearer structure or more consistent patterns, AEs may be able to capture more relevant patient representations compared with the MCW dataset.</p><p>Additionally, the alignment between the AE-learned latent representations and the assumptions underlying Mahalanobis distance could also contribute to the observed differences. AEs are typically optimized for reconstruction rather than directly preserving class separability or the local neighborhood structure needed for effective distance-based retrieval. As a result, the learned representations may not always be well-suited for Mahalanobis distance algorithms, particularly if the AE fails to retain key relationships present in the raw data (eg, mapping all data points into an indistinguishable cluster in the latent space). This suggests that whether this complex transformation process enhances model performance is highly data-dependent.</p><p>It is also important to note that Mahalanobis distance estimation algorithms tend to have higher computational complexity compared with Euclidean distance. For example, the loss function of LMNN is non-convex, requiring the use of semidefinite programming techniques to address this challenge [<xref ref-type="bibr" rid="ref37">37</xref>]. Therefore, while the combination of &#x201C;AE + Mahalanobis distance&#x201D; can achieve optimal performance in certain cases, it is data-dependent and comes at the cost of increased computational complexity. To mitigate this burden, the dimensionality of the latent representation can be intentionally constrained, provided it maintains sufficient discriminative power for the downstream task.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study only investigated swap noise for DAE; other types of noise, such as Gaussian noise, may lead to different behaviors. Additionally, we only examined AKI onset and 1-year mortality, so the models&#x2019; performance may differ for other prediction tasks. 
For example, other studies have shown that DAE outperforms vanilla AE and RAE on multiple datasets and tasks, though not in all cases [<xref ref-type="bibr" rid="ref29">29</xref>]. Next, this study only compared AE variants trained in an unsupervised manner. Incorporating labels during training may help learn more effective latent representations compared with purely unsupervised approaches. Finally, while AEs can effectively capture complex patterns in high-dimensional clinical data, their latent representations are often difficult to interpret clinically, which may limit their utility in real-world decision-support settings. A disentangling framework should be further investigated and incorporated into the current AE model to enhance interpretability by isolating clinically meaningful latent factors, thereby facilitating more transparent integration into clinical decision support systems [<xref ref-type="bibr" rid="ref42">42</xref>].</p></sec><sec id="s4-3"><title>Conclusions</title><p>In this study, we assessed the performance of 5 AE variants&#x2014;vanilla AE, DAE, CAE, SAE, and RAE&#x2014;on 2 real-world EHR datasets, focusing on retrieving similar patients for personalized clinical decision-making. The study also explored the impact of different hyperparameter configurations on AE variants. 
Our results presented three key findings: (1) DAE generally performed best in retrieving similar patients when paired with Euclidean distance (<italic>P</italic>&#x003C;.001); (2) learning rates had the greatest impact on the performance of AE variants; and (3) applying Mahalanobis distance-based k-NN on latent representations can outperform Euclidean distance-based k-NN, although transforming raw data with AE variants did not always guarantee improved performance of Mahalanobis distance-based k-NN.</p></sec></sec></body><back><ack><p>This project was supported by the NIDDK award R01DK137881.</p></ack><notes><sec><title>Data Availability</title><p>The clinical data used for training and validation in this study is not publicly available and restrictions apply to its use. The deidentified multi-center electronic health record datasets in PCORnet Common Data Model may be made available by the Greater Plains Collaborative and PaTH clinical research networks, subject to individual institution's and network-wide data governance and ethical approvals. 
The code for this study can be found at [<xref ref-type="bibr" rid="ref43">43</xref>].</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AE</term><def><p>autoencoder</p></def></def-item><def-item><term id="abb2">AKI</term><def><p>acute kidney injury</p></def></def-item><def-item><term id="abb3">AUPRC</term><def><p>area under the precision-recall curve</p></def></def-item><def-item><term id="abb4">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb5">CAE</term><def><p>contractive autoencoder</p></def></def-item><def-item><term id="abb6">DAE</term><def><p>denoising autoencoder</p></def></def-item><def-item><term id="abb7">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb8">k-NN</term><def><p>k-nearest neighbor</p></def></def-item><def-item><term id="abb9">KL</term><def><p>Kullback-Leibler</p></def></def-item><def-item><term id="abb10">LMNN</term><def><p>large margin nearest neighbor</p></def></def-item><def-item><term id="abb11">MCC</term><def><p>maximum correntropy criterion</p></def></def-item><def-item><term id="abb12">MCW</term><def><p>Medical College of Wisconsin</p></def></def-item><def-item><term id="abb13">MLKR</term><def><p>Metric Learning for Kernel Regression</p></def></def-item><def-item><term id="abb14">MSE</term><def><p>mean squared error</p></def></def-item><def-item><term id="abb15">NCA</term><def><p>neighborhood components analysis</p></def></def-item><def-item><term id="abb16">PCA</term><def><p>principal component analysis</p></def></def-item><def-item><term id="abb17">RAE</term><def><p>robust autoencoder</p></def></def-item><def-item><term id="abb18">SAE</term><def><p>sparse autoencoder</p></def></def-item><def-item><term id="abb19">SCr</term><def><p>serum 
creatinine</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McClellan</surname><given-names>J</given-names> </name><name name-style="western"><surname>King</surname><given-names>MC</given-names> </name></person-group><article-title>Genetic heterogeneity in human disease</article-title><source>Cell</source><year>2010</year><month>04</month><day>16</day><volume>141</volume><issue>2</issue><fpage>210</fpage><lpage>217</lpage><pub-id pub-id-type="doi">10.1016/j.cell.2010.03.032</pub-id><pub-id pub-id-type="medline">20403315</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cho</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Feldman</surname><given-names>M</given-names> </name></person-group><article-title>Heterogeneity of autoimmune diseases: pathophysiologic insights from genetics and implications for new therapies</article-title><source>Nat Med</source><year>2015</year><month>07</month><volume>21</volume><issue>7</issue><fpage>730</fpage><lpage>738</lpage><pub-id pub-id-type="doi">10.1038/nm.3897</pub-id><pub-id pub-id-type="medline">26121193</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Greenland</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Williams-Gray</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Barker</surname><given-names>RA</given-names> </name></person-group><article-title>The clinical heterogeneity of Parkinson&#x2019;s disease and its therapeutic implications</article-title><source>Eur J 
Neurosci</source><year>2019</year><month>02</month><volume>49</volume><issue>3</issue><fpage>328</fpage><lpage>338</lpage><pub-id pub-id-type="doi">10.1111/ejn.14094</pub-id><pub-id pub-id-type="medline">30059179</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rivera-Andrade</surname><given-names>A</given-names> </name><name name-style="western"><surname>Luna</surname><given-names>MA</given-names> </name></person-group><article-title>Trends and heterogeneity of cardiovascular disease and risk factors across Latin American and Caribbean countries</article-title><source>Prog Cardiovasc Dis</source><year>2014</year><volume>57</volume><issue>3</issue><fpage>276</fpage><lpage>285</lpage><pub-id pub-id-type="doi">10.1016/j.pcad.2014.09.004</pub-id><pub-id pub-id-type="medline">25218566</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taubes</surname><given-names>G</given-names> </name></person-group><article-title>Epidemiology faces its limits: the search for subtle links between diet, lifestyle, or environmental factors and disease is an unending source of fear&#x2014;but often yields little certainty</article-title><source>Science</source><year>1995</year><volume>269</volume><issue>5221</issue><fpage>164</fpage><lpage>169</lpage><pub-id pub-id-type="doi">10.1126/science.7618077</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mathur</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sutton</surname><given-names>J</given-names> </name></person-group><article-title>Personalized medicine could transform healthcare</article-title><source>Biomed 
Rep</source><year>2017</year><month>07</month><volume>7</volume><issue>1</issue><fpage>3</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.3892/br.2017.922</pub-id><pub-id pub-id-type="medline">28685051</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>DA</given-names> </name></person-group><article-title>Bringing big data to personalized healthcare: a patient-centered framework</article-title><source>J Gen Intern Med</source><year>2013</year><month>09</month><volume>28 Suppl 3</volume><issue>Suppl 3</issue><fpage>S660</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1007/s11606-013-2455-8</pub-id><pub-id pub-id-type="medline">23797912</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parimbelli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Marini</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sacchi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bellazzi</surname><given-names>R</given-names> </name></person-group><article-title>Patient similarity for precision medicine: a systematic review</article-title><source>J Biomed Inform</source><year>2018</year><month>07</month><volume>83</volume><fpage>87</fpage><lpage>96</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2018.06.001</pub-id><pub-id pub-id-type="medline">29864490</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Development and validation of a personalized model with transfer learning for acute kidney injury risk estimation using electronic health records</article-title><source>JAMA Netw Open</source><year>2022</year><month>07</month><day>1</day><volume>5</volume><issue>7</issue><fpage>e2219776</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2022.19776</pub-id><pub-id pub-id-type="medline">35796212</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yim</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Wheeler</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Curtin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wagner</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Hernandez-Boussard</surname><given-names>T</given-names> </name></person-group><article-title>Secondary use of electronic medical records for clinical research: challenges and opportunities</article-title><source>Converg Sci Phys Oncol</source><year>2018</year><month>03</month><volume>4</volume><issue>1</issue><fpage>014001</fpage><pub-id pub-id-type="doi">10.1088/2057-1739/aaa905</pub-id><pub-id pub-id-type="medline">29732166</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> 
</name><etal/></person-group><article-title>Sequential data-based patient similarity framework for patient outcome prediction: algorithm development</article-title><source>J Med Internet Res</source><year>2022</year><month>01</month><day>6</day><volume>24</volume><issue>1</issue><fpage>e30720</fpage><pub-id pub-id-type="doi">10.2196/30720</pub-id><pub-id pub-id-type="medline">34989682</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A patient similarity network (CHDmap) to predict outcomes after congenital heart surgery: development and validation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>01</month><day>19</day><volume>12</volume><fpage>e49138</fpage><pub-id pub-id-type="doi">10.2196/49138</pub-id><pub-id pub-id-type="medline">38297829</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Maslove</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Dubin</surname><given-names>JA</given-names> </name></person-group><article-title>Personalized mortality prediction driven by electronic medical data and a patient similarity metric</article-title><source>PLoS ONE</source><year>2015</year><volume>10</volume><issue>5</issue><fpage>e0127428</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0127428</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Generating sequential electronic health records using dual adversarial autoencoder</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>07</month><day>1</day><volume>27</volume><issue>9</issue><fpage>1411</fpage><lpage>1419</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa119</pub-id><pub-id pub-id-type="medline">32989459</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beaulieu-Jones</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>JH</given-names> </name></person-group><article-title>Missing data imputation in the electronic health record using deeply learned autoencoders</article-title><source>Pac Symp Biocomput</source><year>2017</year><volume>22</volume><fpage>207</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1142/9789813207813_0021</pub-id><pub-id pub-id-type="medline">27896976</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chowdhury</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>Mixed pooling multi-view attention autoencoder for representation learning in healthcare</article-title><source>arXiv</source><comment>Preprint 
posted online on 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.06456</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sadati</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nezhad</surname><given-names>MZ</given-names> </name><name name-style="western"><surname>Chinnam</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>D</given-names> </name></person-group><article-title>Representation learning with autoencoders for electronic health records: a comparative study</article-title><source>arXiv</source><comment>Preprint posted online on 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1801.02961</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bank</surname><given-names>D</given-names> </name><name name-style="western"><surname>Koenigstein</surname><given-names>N</given-names> </name><name name-style="western"><surname>Giryes</surname><given-names>R</given-names> </name></person-group><article-title>Autoencoders</article-title><source>Machine Learning for Data Science Handbook: Data Mining and Knowledge Discovery Handbook</source><year>2023</year><publisher-name>Springer</publisher-name><fpage>353</fpage><lpage>374</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-24628-9_16</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>W</given-names> </name></person-group><article-title>Auto-encoders in deep learning&#x2014;a review with new 
perspectives</article-title><source>Mathematics</source><year>2023</year><volume>11</volume><issue>8</issue><fpage>1777</fpage><pub-id pub-id-type="doi">10.3390/math11081777</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pei</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name></person-group><article-title>A comprehensive survey on design and application of autoencoder in deep learning</article-title><source>Appl Soft Comput</source><year>2023</year><month>05</month><volume>138</volume><fpage>110176</fpage><pub-id pub-id-type="doi">10.1016/j.asoc.2023.110176</pub-id><pub-id pub-id-type="medline">36531119</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jun</surname><given-names>CH</given-names> </name></person-group><article-title>A personalized classification model using similarity learning via supervised autoencoder</article-title><source>Appl Soft Comput</source><year>2022</year><month>12</month><volume>131</volume><fpage>109773</fpage><pub-id pub-id-type="doi">10.1016/j.asoc.2022.109773</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miotto</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kidd</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Dudley</surname><given-names>JT</given-names> 
</name></person-group><article-title>Deep patient: an unsupervised representation to predict the future of patients from the electronic health records</article-title><source>Sci Rep</source><year>2016</year><month>05</month><day>17</day><volume>6</volume><fpage>26094</fpage><pub-id pub-id-type="doi">10.1038/srep26094</pub-id><pub-id pub-id-type="medline">27185194</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>HC</given-names> </name><etal/></person-group><article-title>Deep representation learning of electronic health records to unlock patient stratification at scale</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><fpage>96</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-0301-z</pub-id><pub-id pub-id-type="medline">32699826</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>CS</given-names> </name></person-group><article-title>Parameter tuning is a key part of dimensionality reduction via deep variational autoencoders for single cell RNA transcriptomics</article-title><source>Pac Symp Biocomput</source><year>2019</year><volume>24</volume><fpage>362</fpage><lpage>373</lpage><pub-id pub-id-type="medline">30963075</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vincent</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Manzagol</surname><given-names>PA</given-names> </name></person-group><article-title>Extracting and composing robust features with denoising autoencoders</article-title><conf-name>Proceedings of the 25th International Conference on Machine Learning (ICML)</conf-name><conf-date>Jul 5, 2008</conf-date><conf-loc>Helsinki, Finland</conf-loc><fpage>1096</fpage><lpage>1103</lpage><pub-id pub-id-type="doi">10.1145/1390156.1390294</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Rifai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>P</given-names> </name></person-group><article-title>A generative process for sampling contractive auto-encoders</article-title><source>arXiv</source><comment>Preprint posted online on 2012</comment><pub-id pub-id-type="doi">10.1007/978-3-642-23783-6_41</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ng</surname><given-names>A</given-names> </name></person-group><article-title>Sparse autoencoder</article-title><source>CS294A Lecture Notes</source><year>2011</year><access-date>2025-07-03</access-date><volume>72</volume><publisher-name>Stanford University</publisher-name><fpage>1</fpage><lpage>19</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://web.stanford.edu/class/cs294a/sparseAutoencoder_2011new.pdf">https://web.stanford.edu/class/cs294a/sparseAutoencoder_2011new.pdf</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Z</given-names> </name></person-group><article-title>Robust feature learning by stacked autoencoder with maximum correntropy criterion</article-title><conf-name>ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-date>May 4-9, 2014</conf-date><conf-loc>Florence, Italy</conf-loc><fpage>6716</fpage><lpage>6720</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2014.6854900</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pulgar</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Charte</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>del Jesus</surname><given-names>MJ</given-names> </name></person-group><article-title>Choosing the proper autoencoder for feature fusion based on data complexity and classifiers: analysis, tips and guidelines</article-title><source>Information Fusion</source><year>2020</year><month>02</month><volume>54</volume><fpage>44</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2019.07.004</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Rathore</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Shukla</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Prakash</surname><given-names>J</given-names> </name></person-group><article-title>Hospital-acquired acute kidney injury in medical, surgical, and intensive care unit: a comparative study</article-title><source>Indian J Nephrol</source><year>2013</year><month>01</month><volume>23</volume><issue>1</issue><fpage>24</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.4103/0971-4065.107192</pub-id><pub-id pub-id-type="medline">23580801</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khwaja</surname><given-names>A</given-names> </name></person-group><article-title>KDIGO clinical practice guidelines for acute kidney injury</article-title><source>Nephron Clin Pract</source><year>2012</year><volume>120</volume><issue>4</issue><fpage>c179</fpage><lpage>84</lpage><pub-id pub-id-type="doi">10.1159/000339789</pub-id><pub-id pub-id-type="medline">22890468</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azur</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Stuart</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Frangakis</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Leaf</surname><given-names>PJ</given-names> </name></person-group><article-title>Multiple imputation by chained equations: what is it and how does it work?</article-title><source>Int J Methods Psychiatr Res</source><year>2011</year><month>03</month><volume>20</volume><issue>1</issue><fpage>40</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1002/mpr.329</pub-id><pub-id pub-id-type="medline">21499542</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Beaumont</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Rivest</surname><given-names>LP</given-names> </name></person-group><article-title>Dealing with outliers in survey data</article-title><source>Handbook of Statistics Elsevier</source><year>2009</year><publisher-name>Elsevier</publisher-name><fpage>247</fpage><lpage>279</lpage><pub-id pub-id-type="doi">10.1016/S0169-7161(08)00011-4</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name></person-group><article-title>Denoise autoencoder for tabular data</article-title><year>2021</year><access-date>2025-07-03</access-date><publisher-name>GitHub</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/serteal/denoisingautoencoders">https://github.com/serteal/denoisingautoencoders</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vincent</surname><given-names>P</given-names> </name><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Lajoie</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Manzagol</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Bottou</surname><given-names>L</given-names> </name></person-group><article-title>Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion</article-title><source>J Mach Learn Res</source><year>2010</year><volume>11</volume><issue>12</issue><pub-id pub-id-type="doi">10.5555/1756006.1953039</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pokharel</surname><given-names>PP</given-names> </name><name name-style="western"><surname>Principe</surname><given-names>JC</given-names> </name></person-group><article-title>Correntropy: properties and applications in non-gaussian signal processing</article-title><source>IEEE Trans Signal Process</source><year>2007</year><volume>55</volume><issue>11</issue><fpage>5286</fpage><lpage>5298</lpage><pub-id pub-id-type="doi">10.1109/TSP.2007.896065</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Saul</surname><given-names>LK</given-names> </name></person-group><article-title>Distance metric learning for large margin nearest neighbor classification</article-title><source>J Mach Learn Res</source><year>2009</year><volume>10</volume><issue>2</issue><pub-id pub-id-type="doi">10.5555/1577069.1577078</pub-id></nlm-citation></ref><ref 
id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldberger</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Roweis</surname><given-names>S</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>RR</given-names> </name></person-group><article-title>Neighbourhood components analysis</article-title><source>Adv Neural Inf Process Syst</source><year>2004</year><volume>17</volume><pub-id pub-id-type="doi">10.5555/2976040.2976105</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Tesauro</surname><given-names>G</given-names> </name></person-group><article-title>Metric learning for kernel regression</article-title><access-date>2025-07-03</access-date><conf-name>Proceedings of Machine Learning Research</conf-name><conf-date>Mar 21-24, 2007</conf-date><conf-loc>San Juan, Puerto Rico</conf-loc><fpage>612</fpage><lpage>619</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v2/weinberger07a.html">https://proceedings.mlr.press/v2/weinberger07a.html</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pulgar</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Charte</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>del Jesus</surname><given-names>MJ</given-names> 
</name></person-group><article-title>AEkNN: an autoencoder kNN&#x2013;based classifier with built-in dimensionality reduction</article-title><source>IJCIS</source><year>2018</year><volume>12</volume><issue>1</issue><fpage>436</fpage><pub-id pub-id-type="doi">10.2991/ijcis.2018.125905686</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vazelhes</surname><given-names>W</given-names> </name><name name-style="western"><surname>Carey</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Vauquier</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bellet</surname><given-names>A</given-names> </name></person-group><article-title>metric-learn: metric learning algorithms in python</article-title><source>J Mach Learn Res</source><year>2020</year><volume>21</volume><issue>138</issue><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.48550/arXiv.1908.04710</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassija</surname><given-names>V</given-names> </name><name name-style="western"><surname>Chamola</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mahapatra</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Interpreting black-box models: a review on explainable artificial intelligence</article-title><source>Cogn Comput</source><year>2024</year><month>01</month><volume>16</volume><issue>1</issue><fpage>45</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1007/s12559-023-10179-8</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="web"><article-title>Autoencoder-based representation 
learning for similar patient retrieval from electronic health records: a comparative study</article-title><source>GitHub</source><access-date>2025-06-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/GatorAIM/AKI_AE_Compare">https://github.com/GatorAIM/AKI_AE_Compare</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Model-specific hyperparameters fine-tuned and fixed for downstream experiments. In DAE, &#x03C1; denotes the amount of noise added to the data. In CAE, &#x03BB; denotes the strength of the additional Frobenius norm penalty term. In SAE, &#x03C1; denotes the mean of the Bernoulli distribution, and &#x03B2; denotes the strength of the additional KL divergence penalty term. In RAE, &#x03C3; denotes the variance of the Gaussian distribution, &#x03C1; denotes the mean of the Bernoulli distribution, &#x03B2; denotes the strength of the additional KL divergence penalty term, and &#x03BB; denotes the strength of the weight decay. CAE: contractive autoencoder; RAE: robust autoencoder; SAE: sparse autoencoder.</p><media xlink:href="medinform_v13i1e68830_app1.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p><italic>P</italic> value tables showing the statistical significance of the <italic>F</italic><sub>1</sub>-score differences between different AE variants across the 168 different hyperparameter configurations of predicting AKI onset on the KUMC dataset, using a one-tailed paired <italic>t</italic> test. <italic>P</italic>&#x003C;.01 was considered statistically significant. 
Each entry represents the <italic>P</italic> value indicating whether the AE model in the row outperformed the AE model in the column.</p><media xlink:href="medinform_v13i1e68830_app2.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p><italic>P</italic> value tables showing the statistical significance of the <italic>F</italic><sub>1</sub>-score differences between various AE variants across 168 different hyperparameter configurations for predicting 1-year mortality on the KUMC dataset.</p><media xlink:href="medinform_v13i1e68830_app3.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Supplementary figures.</p><media xlink:href="medinform_v13i1e68830_app4.docx" xlink:title="DOCX File, 972 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p><italic>F</italic><sub>1</sub>-scores of k-NNs with Euclidean distance and with Mahalanobis distance on the latent representations produced by the best-performing hyperparameter configuration, constrained by a latent-to-input dimension ratio of 0.5 for each AE variant, on the MCW dataset.</p><media xlink:href="medinform_v13i1e68830_app5.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p><italic>F</italic><sub>1</sub>-scores of k-NNs with Euclidean distance and with Mahalanobis distance on the latent representations produced by the best-performing hyperparameter configuration of each AE variant on the MCW dataset.</p><media xlink:href="medinform_v13i1e68830_app6.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material></app-group></back></article>