<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v11i1e38590</article-id>
      <article-id pub-id-type="pmid">36662548</article-id>
      <article-id pub-id-type="doi">10.2196/38590</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Dealing With Missing, Imbalanced, and Sparse Features During the Development of a Prediction Model for Sudden Death Using Emergency Medicine Data: Machine Learning Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Hefner</surname>
            <given-names>Jennifer</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Min</surname>
            <given-names>Lingtong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Puladi</surname>
            <given-names>Behrus</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Xiaojie</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6415-5982</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Han</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8112-5961</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Nan</surname>
            <given-names>Shan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7807-3125</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Kong</surname>
            <given-names>Xiangtian</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8041-7591</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Duan</surname>
            <given-names>Huilong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3893-213X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>Haiyan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>First Medical Center of Chinese People's Liberation Army General Hospital</institution>
            <addr-line>28 Fuxing Road, Haidian District</addr-line>
            <addr-line>Beijing, 100037</addr-line>
            <country>China</country>
            <phone>86 13521361644</phone>
            <email>xiaoyanzibj301@163.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8285-4226</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Key Laboratory of Biomedical Engineering of Hainan Province</institution>
        <institution>School of Biomedical Engineering</institution>
        <institution>Hainan University</institution>
        <addr-line>Haikou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Hainan Hospital of Chinese People's Liberation Army General Hospital</institution>
        <addr-line>Sanya</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>IMWare</institution>
        <addr-line>Wuhan</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>College of Biomedical Engineering and Instrumental Science</institution>
        <institution>Zhejiang University</institution>
        <addr-line>Hangzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>First Medical Center of Chinese People's Liberation Army General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Haiyan Zhu <email>xiaoyanzibj301@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>20</day>
        <month>1</month>
        <year>2023</year>
      </pub-date>
      <volume>11</volume>
      <elocation-id>e38590</elocation-id>
      <history>
        <date date-type="received">
          <day>12</day>
          <month>4</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>7</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>9</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>6</day>
          <month>12</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Xiaojie Chen, Han Chen, Shan Nan, Xiangtian Kong, Huilong Duan, Haiyan Zhu. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 20.01.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2023/1/e38590" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>In emergency departments (EDs), early diagnosis and timely rescue, which are supported by prediction models using ED data, can increase patients’ chances of survival. Unfortunately, ED data usually contain missing, imbalanced, and sparse features, which makes it challenging to build early identification models for diseases.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to propose a systematic approach to deal with the problems of missing, imbalanced, and sparse features for developing sudden-death prediction models using emergency medicine (or ED) data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We proposed a 3-step approach to deal with data quality issues: a random forest (RF) for missing values, k-means for imbalanced data, and principal component analysis (PCA) for sparse features. For continuous and discrete variables, the coefficient of determination R<sup>2</sup> and the κ coefficient were used to evaluate interpolation performance, respectively. The area under the receiver operating characteristic curve (AUROC) and the area under the precision-recall curve (AUPRC) were used to estimate the model’s performance. To further evaluate the proposed approach, we carried out a case study using an ED data set obtained from the Hainan Hospital of Chinese PLA General Hospital. A logistic regression (LR) prediction model for patient condition worsening was built.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A total of 1085 patients with rescue records and 17,959 patients without rescue records were selected; the 2 groups were significantly imbalanced. We extracted 275, 402, and 891 variables from laboratory tests, medications, and diagnoses, respectively. After data preprocessing, the median R<sup>2</sup> of the RF continuous variable interpolation was 0.623 (IQR 0.647), and the median of the κ coefficient for discrete variable interpolation was 0.444 (IQR 0.285). The LR model constructed using the initial diagnostic data showed poor performance and variable separation, which was reflected in the abnormally high odds ratio (OR) values of the 2 variables of cardiac arrest and respiratory arrest (201568034532 and 1211118945, respectively) and an abnormal 95% CI. Using processed data, the recall of the model reached 0.746, the <italic>F</italic><sub>1</sub>-score was 0.73, and the AUROC was 0.708.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The proposed systematic approach is valid for building a prediction model for emergency patients.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>emergency medicine</kwd>
        <kwd>prediction model</kwd>
        <kwd>data preprocessing</kwd>
        <kwd>imbalanced data</kwd>
        <kwd>missing value interpolation</kwd>
        <kwd>sparse features</kwd>
        <kwd>clinical informatics</kwd>
        <kwd>machine learning</kwd>
        <kwd>medical informatics</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In the emergency department (ED), early identification of high-risk patients can improve clinical decisions, avoid waste of resources, and lead to better patient prognosis [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. A prospective study showed that the incidence of adverse events due to improper emergency care is about 5%-10%, of which half can be prevented through early detection [<xref ref-type="bibr" rid="ref3">3</xref>]. However, early identification is difficult as these patients often show few obvious signs before rapid deterioration [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Prediction models for high-risk patients in EDs can greatly support caregivers [<xref ref-type="bibr" rid="ref5">5</xref>]. Electronic medical record (EMR) data, which fully capture patients’ status, are an important source for developing disease risk prediction models [<xref ref-type="bibr" rid="ref6">6</xref>]. As a typical high-risk disease in EDs, sudden death is a major public health problem worldwide, accounting for 15%-20% of all deaths [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. A previous study showed that cardiogenic diseases, potassium, mean platelet volume, creatinine, chloride, and sodium are important variables to predict the risk of death in patients [<xref ref-type="bibr" rid="ref5">5</xref>]. A survey showed that age, male sex, hypertension, diabetes, hypercholesterolemia, and a family history of coronary heart disease are all associated with increased risk of sudden death [<xref ref-type="bibr" rid="ref9">9</xref>]. A study evaluating the relationship between the variables of laboratory tests and the occurrence of acute death in patients found that serum sodium, glucose, and the leukocyte count show a U-shaped relationship with mortality [<xref ref-type="bibr" rid="ref10">10</xref>]. In addition, total bilirubin, creatine kinase, the international normalized ratio, aspartate aminotransferase, and lactate dehydrogenase are all risk factors associated with acute death in patients [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. However, the data quality of EMRs limits their effective use for developing prediction models [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Prediction of sudden death needs a variety of clinical data, which frequently have missing values, imbalanced classes, and sparse features.</p>
      <p>Missing values, imbalanced data, and sparse features are 3 common problems of EMR data. Missing values indicate that not enough data were collected due to improper use of the hospital information system or other reasons [<xref ref-type="bibr" rid="ref14">14</xref>]. Imbalanced data refer to the imbalanced distribution of negative and positive samples. This leads to more features of negative samples in the learning model, which is not suitable for the prediction of arbitrary patients [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Sparse features arise when zero-valued features far outnumber nonzero features; they increase computing memory and reduce generalization ability [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Especially in small samples, a large amount of noise in sparse features can prevent model training from converging. Therefore, tackling these quality issues of EMR data is an essential step to improve the predictive performance of machine learning (ML) models.</p>
      <p>To solve the aforementioned 3 problems, we propose a series of ML approaches to increase fitting ability and generalization ability. Using the approach, we developed a sudden-death prediction model. The risk factors related to sudden death obtained through a logistic regression (LR) model were consistent with the results reported in the earlier literature on the analysis of risk factors of in-hospital death. These results show that our data-preprocessing approach can effectively maintain the rich information contained in emergency data and provide a reliable data source for the development of a sudden-death prediction model.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>Our methods of data preprocessing consisted of 5 steps, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. The last 3 steps tackle 3 low-quality issues: missing values, imbalanced data, and sparse features. Finally, postprocessing data quality is evaluated by a sudden-death prediction model case study.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Workflow of ED data preprocessing and evaluation. ED: emergency department; EMR: electronic medical record.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38590_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Collection and Cleaning</title>
        <p>Data for ED patient prediction model development are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <p>Close investigation of each data table is required so as to know the location of our content of interest. For instance, data regarding a patient’s basic information are stored in the emg_visit table. Lab test items and results are stored in the lab_result and lab_master tables. The clinical record field in the emg_order table can be used to determine whether a sudden-death event occurred. One lab test (eg, blood test) can be performed multiple times to observe the patient status closely. Based on clinical experts’ opinions, only the last one is meaningful.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Description of the data table involved in the query process.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="280"/>
            <col width="720"/>
            <thead>
              <tr valign="top">
                <td>Table name</td>
                <td>Data description</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>emg_drug_detail</td>
                <td>The patient's medication record, including the prescription number, drug name, dosage, drug specification, administration time, and administration route during the treatment period</td>
              </tr>
              <tr valign="top">
                <td>emg_drug_master</td>
                <td>Master record form of patient medication recording patient ID and prescription number</td>
              </tr>
              <tr valign="top">
                <td>emg_order</td>
                <td>Doctor’s order record form used to record the medication, inspection, diagnosis, treatment, and other doctor’s orders of the patient during treatment</td>
              </tr>
              <tr valign="top">
                <td>emg_visit</td>
                <td>Patient visit information table, including the patient's basic personal information, diagnosis of the current visit, triage, and other information</td>
              </tr>
              <tr valign="top">
                <td>lab_test_master</td>
                <td>Patient’s laboratory test master record form recording the patient’s age and gender information, laboratory test items made during the visit, and the corresponding doctor’s order ID</td>
              </tr>
              <tr valign="top">
                <td>lab_result</td>
                <td>Laboratory test results of patients, including test results of patients</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Variable Screening</title>
        <p>The number of variables obtained from the data collection was large, so screening of important variables facilitated final analysis. Two approaches can be adopted. One is based on statistical significance. The other is based on the specific research objective, opinions of medical experts, or authoritative literature [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. In our study, the first approach was taken. Variables with many missing values were filtered out using the threshold. For example, Alvarez et al [<xref ref-type="bibr" rid="ref19">19</xref>] set the threshold to 2%, while Seki et al [<xref ref-type="bibr" rid="ref20">20</xref>] set it to 25%. In this study, we set the threshold to 80%. This means that when 80% of the values of 1 variable are missing, that variable should be filtered out.</p>
      </sec>
      <sec>
        <title>Data Interpolation for Missing Values</title>
        <p>Missing values affect the effectiveness of ML models. Missing data show 3 different patterns: missing completely at random (MCAR), missing at random (MAR), and missing not at random (MNAR). MCAR means that the missingness of data is completely random and does not depend on observed or unobserved values [<xref ref-type="bibr" rid="ref21">21</xref>]. In this case, any interpolation method will not cause deviation. However, the assumption of MCAR in actual data is difficult to satisfy [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. MNAR and MAR mean that the missingness of data depends on the unobserved value and does not depend on the unobserved value, respectively [<xref ref-type="bibr" rid="ref24">24</xref>]. However, it is impossible to infer whether the missing pattern belongs to MNAR or MAR through the existing data containing the missing pattern, and the assumption based on MAR is more consistent with the actual data situation [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. MAR allows us to estimate missing values using existing observation data in the data set [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>The goal of all kinds of interpolation methods is to reasonably estimate missing values and improve the quality of data. Interpolation methods are mainly divided into single interpolation and multiple interpolation. Multiple interpolation is a commonly used and better-performing interpolation method. It generates multiple possible estimates for missing data and uses statistical inference to interpolate the final value. This method can reflect the randomness of missing data, and the interpolation error is smaller [<xref ref-type="bibr" rid="ref21">21</xref>]. In a single interpolation, interpolation methods, such as constants (ie, specific identifications), mean, median, and data distribution, can be used. However, such methods usually cause greater deviation [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The single interpolation method based on ML has attracted increasingly more attention [<xref ref-type="bibr" rid="ref23">23</xref>], such as interpolation based on a clustering algorithm [<xref ref-type="bibr" rid="ref28">28</xref>], an ensemble model [<xref ref-type="bibr" rid="ref29">29</xref>], and Bayesian theory [<xref ref-type="bibr" rid="ref30">30</xref>]. Multiple imputation can bring smaller deviation and should be considered when the frequency of missing data is high and the sample size is small [<xref ref-type="bibr" rid="ref31">31</xref>]. However, its implementation is relatively complex, and it needs to involve the selection of an interpolation model and the number of interpolation data created [<xref ref-type="bibr" rid="ref32">32</xref>]. When the data are sufficient and the variability of the estimated value does not need to be considered, it is feasible to choose multiple imputation or single imputation [<xref ref-type="bibr" rid="ref31">31</xref>]. 
Considering that our sample size was relatively sufficient, to build a simpler interpolation method, we used a random forest (RF) [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] as the interpolation algorithm to realize the interpolation of missing data in the form of a single interpolation.</p>
        <p>Altogether, the following steps are proposed.</p>
        <list list-type="order">
          <list-item>
            <p>For variable “i,” 1 set of patient samples without missing values work as training samples and the other set of patient samples with missing values work as test samples.</p>
          </list-item>
          <list-item>
            <p>If other variables in the 2 samples are missing, the mean (continuous variable) or mode (discrete variable) is temporarily interpolated to form a complete sample.</p>
          </list-item>
          <list-item>
            <p>The training samples are used to train an RF model, and the model is then applied to the test samples to predict missing values.</p>
          </list-item>
          <list-item>
            <p>For the next variable, steps 1, 2, and 3 are repeated until all variables of the whole sample are interpolated.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Processing Imbalanced Data</title>
        <p>Imbalanced data refer to the imbalanced distribution of negative and positive samples. For example, in the classification of rare diseases and credit predictions, there could be more negative samples than positive ones. Because most ML algorithms assume that categories (eg, positive or negative) of samples are evenly distributed, classifying models trained with imbalanced data are more likely to classify a new sample into the majority category [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>Basic solutions for imbalanced data are to use under- or oversampling to make the data balanced, such as random oversampling [<xref ref-type="bibr" rid="ref35">35</xref>], random undersampling, the synthetic minority oversampling technique (SMOTE) [<xref ref-type="bibr" rid="ref36">36</xref>], and the adaptive synthetic sampling method (ADASYN) [<xref ref-type="bibr" rid="ref15">15</xref>]. Although both undersampling and oversampling approaches can achieve data balance, the oversampling approach adds many sample copies, which can cause the model to overfit. Wang and Japkowicz [<xref ref-type="bibr" rid="ref16">16</xref>] and Chawla et al [<xref ref-type="bibr" rid="ref36">36</xref>] also argued that undersampling is more favorable than oversampling in extreme imbalance situations. However, random undersampling may also discard some representative samples. Segura-Bedmar et al [<xref ref-type="bibr" rid="ref37">37</xref>] and Lin et al [<xref ref-type="bibr" rid="ref38">38</xref>] proposed a clustering method to tackle this problem. The k-means algorithm considers the similarity between samples and uses the sample closest to the centroid of the cluster to approximate all the sample characteristics within the cluster, and the obtained samples are representative. The advantage of the clustering method over random undersampling is that all samples are used in the clustering process. This ensures that the information about all samples can be used to determine the sampling results and some important samples are not randomly discarded. In addition, we can adjust the number of clusters in k-means according to the actual data imbalance so as to achieve different undersampling ratios without other complex adjustments.</p>
        <p>To avoid the loss of important samples, we adopted k-means based on the Euclidean distance to cluster samples. New samples were generated through clustering, which had similar characteristics in the same cluster and were distinguished in the different clusters. The centroid of a cluster represents the overall characteristics of the whole cluster. In this way, important features are not discarded. Since the centroid of the cluster is calculated based on the average of the samples in the cluster, the centroid is not necessarily a real sample. So, we took the real samples with the smallest distance from the centroid.</p>
      </sec>
      <sec>
        <title>Processing Sparse Features</title>
        <p>Sparse features mean that the number of feature dimensions is much larger than the actual number of nonzero features. In total, there were 891 different types of diagnosis in our data set. However, for a single patient, the number of diagnoses was quite small. This formed the sparse-feature phenomenon.</p>
        <p>When sparse features occur, the sample is prone to having the problem of variable separation and multicollinearity. That is, a single variable or a linear combination of multiple variables can perfectly predict outcome events. However, this only works for small-size samples. It also leads to the situation in which the model gives an abnormally large weight to the variables and the results are unreliable [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Although there are many methods to optimize weights, such as gradient descent, a large number of zeros in features make the gradient tend to 0, and the parameters cannot be fully trained.</p>
        <p>The processing of sparse features can be considered from both the model and the data themselves. From the point of view of the model, the parameter estimation bias of high-dimensional sparse data can be reduced through the optimization of the algorithm. For example, Firth regression [<xref ref-type="bibr" rid="ref40">40</xref>] is used. The basic idea is to add a penalty term to the score function so as to reduce the deviation of the maximum-likelihood estimate of the parameter. This can solve the problem of variable separation and multicollinearity caused by sparse features to a large extent. From the point of view of the data themselves, it is necessary to transform the data to be processed into nonsparse data, and this transformation should retain the amount of information contained in the original data as much as possible. Considering the theme of our paper, our goal is to improve the quality of data rather than optimize the model algorithm. Therefore, we solved the problem of sparse features from the perspective of data. At present, there are many dimensionality reduction methods for high-dimensional sparse features, such as principal component analysis (PCA) [<xref ref-type="bibr" rid="ref39">39</xref>], singular value decomposition (SVD) [<xref ref-type="bibr" rid="ref41">41</xref>], and linear discriminant analysis (LDA) [<xref ref-type="bibr" rid="ref42">42</xref>]. The essence of these methods is to map the original data to a low-dimensional space through a specific transformation form to solve the problem of data sparsity. Among these methods, LDA needs to reduce dimensionality based on sample labels. Considering that the actual data may not be able to carry labels, and the difference in label definitions will greatly affect the dimensionality reduction results, this supervised dimensionality reduction method is not conducive to being extended to other data scenarios [<xref ref-type="bibr" rid="ref43">43</xref>]. 
Therefore, we considered using unsupervised dimensionality reduction methods, such as PCA, to transform our data.</p>
        <p>PCA has been widely used in analysis with high-dimensional sparse features [<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>]. PCA essentially transforms the feature space of the original sample so that the new feature is a linear combination of the original features. The basic principle of principal component (PC) selection is to keep the maximum variance, and all PCs are orthogonal to one another. Thus, the phenomenon of multicollinearity is avoided. Therefore, new samples no longer have sparse features, which makes the ML model better fit the parameters.</p>
        <p>In detail, new data can replace the original data as the input source for regression or classification models. Suppose <inline-graphic xlink:href="medinform_v11i1e38590_fig10.png" xlink:type="simple" mimetype="image"/>
 where each column represents a feature and each row is a sample. Assuming that the sample has been centered (ie, each feature has zero mean), <inline-graphic xlink:href="medinform_v11i1e38590_fig11.png" xlink:type="simple" mimetype="image"/> represents the covariance matrix of X. Let the covariance matrix of the transformed matrix Y = XV be D, which is derived as:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v11i1e38590_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>As C is a real symmetric matrix of order m, according to the properties of real symmetric matrices, it must have m orthonormal eigenvectors. That is, <inline-graphic xlink:href="medinform_v11i1e38590_fig13.png" xlink:type="simple" mimetype="image"/> is a matrix that diagonalizes the original covariance matrix by a similarity transformation. Therefore, we solved for the m eigenvalues and eigenvectors of <inline-graphic xlink:href="medinform_v11i1e38590_fig14.png" xlink:type="simple" mimetype="image"/>. By sorting the eigenvalues from large to small, we got λ = (λ<sub>1</sub>, λ<sub>2</sub>, …, λ<sub>m</sub>). The following relationships hold:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v11i1e38590_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Take the first k columns of V as the basis for transforming m-dimensional features into k-dimensional features and record it as <inline-graphic xlink:href="medinform_v11i1e38590_fig16.png" xlink:type="simple" mimetype="image"/>; the transformed sample matrix is then Y = XP.</p>
        <p>First, we manually merged similar diagnostic nouns according to prior knowledge, reducing their number from 891 to 405. However, the data were obviously separated and sparse. For instance, none of the negative samples had a sudden cardiac arrest or sudden respiratory arrest diagnosis. Next, we only kept the diagnoses that appeared in more than 5% of the population. Finally, PCA was applied to the remaining variables. The first 17 PCs, which could explain 98.2% of the variance of the original sample, were selected. Regression analysis was carried out on the samples after dimensionality reduction. The explanation of variables was achieved by calculating the weight of the original variables on each PC.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>After preliminary review, the project was found to be in line with relevant medical ethics requirements. If it is funded by the Hainan Major Science and Technology Program in 2020, the Hainan Medical Ethics Committee will perform its duties and strictly abide by relevant regulations and requirements for medical ethics and informed consent of patients to ensure ethical supervision and review during the implementation of the project (reference number: 00824482406).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Preprocessing and Model Building</title>
        <p>A comprehensive evaluation was carried out on the ED data set of the Hainan Hospital of Chinese PLA General Hospital. We developed a set of Python programs to implement our methods. Specifically, the programs were developed on Microsoft Windows 10 (Intel(R) Core(TM) i5-9500 CPU, 3 GHz). All data preprocessing and model building were completed in Python (Python 3.8, Anaconda) using multiple Python data science libraries, mainly including NumPy, pandas, Matplotlib, and scikit-learn. In addition, the code for data interpolation, imbalance correction, and PC regression is currently available on GitHub [<xref ref-type="bibr" rid="ref47">47</xref>].</p>
      </sec>
      <sec>
        <title>Data Collection and Cleaning</title>
        <p>We collected the data of patients who went to the ED of the Hainan Hospital of Chinese PLA General Hospital from July 27, 2017, to May 6, 2021. In the sudden-death group, the data of 1085 patients were collected. In the non-sudden-death group, the data of 17,959 patients were collected. For the analysis of laboratory test data, we excluded patients who did not have any laboratory test records before sudden death. A total of 108 (10%) patients were excluded, and 977 (90%) patients with sudden death were used for the analysis of laboratory test data. For diagnostic data, we excluded patients who were missing diagnostic data from the visit. Finally, there were 1083 patients with sudden death and 615 patients with nonsudden death. We developed statistics on the baseline data of all patients, as shown in Supplementary Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Distributions of age and gender are visualized in <xref rid="figure2" ref-type="fig">Figures 2</xref>-<xref rid="figure5" ref-type="fig">5</xref>.</p>
        <p>In the first group, there were 741 males (68.4%) and 342 females (31.6%), and 2 (0.2%) patients lacked gender information (<xref rid="figure2" ref-type="fig">Figure 2</xref>). The age varied between 45 and 80 years. The mean age was 56.4 years (SD 11.2). The quartile, median, and mode were 44, 59, and 68, respectively. In the second group, there were 9403 (52.4%) males and 8556 (47.6%) females. The age distribution is shown in <xref rid="figure4" ref-type="fig">Figures 4</xref> and <xref rid="figure5" ref-type="fig">5</xref>. The mean age was 41.6 years (SD 13.6). The quartile, median, and mode were 29, 42, and 48, respectively. For both groups, their distributions of age were akin to the normal distribution, which is consistent with a real-life situation.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Distribution of the gender of patients with sudden death.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38590_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Distribution of the gender of patients without sudden death.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38590_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Distribution of age of patients with sudden death.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38590_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Distribution of age of patients without sudden death.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38590_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Variable Screening</title>
        <p>To perform variable screening, that is, filtering out insignificant variables, we counted, for each variable, the number of times it appeared and the number of times it was missing. The second row of <xref ref-type="table" rid="table2">Table 2</xref> shows the number of patients who had no corresponding data in the individual category. Moreover, we investigated the reasons for missing data in all 3 categories. For instance, 108 (10%) patients had no laboratory test data. Among them, we could not find lab test data for 33 (30.6%) patients. For the remaining 75 (69.4%) patients, their lab tests appeared after the sudden-death event. There were 287 (26.4%) patients with no medication data. For these patients, sudden death had occurred before the medication was given, and the medication, such as an epinephrine injection, was in the doctor’s order record but was not recorded in the patient’s medication table.</p>
        <p>There were 275 variables in the lab test category. For a given variable, not every patient (sample) had a value; that is, the variable had missing values. The missing ratio of a variable was obtained by dividing the number of patients with a missing value for that variable by the total number of patients. The average ratio was 79.8%, as shown in the third row of <xref ref-type="table" rid="table2">Table 2</xref>. So, we set an 80% threshold to screen out variables with excessive missing values. Finally, 72 variables were kept in this category. These included patient age, gender, glucose, creatine kinase, inorganic phosphorus, total cholesterol, triglycerides, potassium, sodium, and calcium.</p>
        <p>For diagnosis, 891 different types of diagnosis were obtained after the initial data collection. Because the diagnosis is recorded in the form of free text, 1 diagnosis item could have several different synonyms. By merging these texts into a unified name via manual review, we obtained 405 variables. The number of confirmed patients of each diagnostic variable was counted. Instead of the 80% missing-ratio threshold, a 5% prevalence threshold was used. Considering both positive and negative samples, 18 diagnostic variables were kept. Among them, 11 (61.1%) variables were shared by both groups. The 18 variables were myocardial infarction, chest distress, sudden cardiac arrest, fever, rib fracture, renal dysfunction, chest pain, diabetes, abdominal pain, pulmonary infection, respiratory arrest, trauma, atrial fibrillation, disturbance of consciousness, cerebral hemorrhage, cerebral infarction, coronary heart disease, and hypertension.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Missing value ratios of variables of patients with sudden death.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Laboratory tests (275 variables)</td>
                <td>Medications (402 variables)</td>
                <td>Diagnosis (891 variables)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Patients without data, n (%)</td>
                <td>108 (10%)</td>
                <td>287 (26.4%)</td>
                <td>2 (0.18%)</td>
              </tr>
              <tr valign="top">
                <td>Average ratio of missing values</td>
                <td>79.8% (866/1085)</td>
                <td>72.4% (786/1085)</td>
                <td>99% (1080/1085)</td>
              </tr>
              <tr valign="top">
                <td>Maximum ratio of missing values</td>
                <td>90% (977/1085)</td>
                <td>73.5% (797/1085)</td>
                <td>100% (1085/1085)</td>
              </tr>
              <tr valign="top">
                <td>Minimum ratio of missing values</td>
                <td>25.8% (280/1085)</td>
                <td>48.5% (526/1085)</td>
                <td>58.4% (634/1085)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Interpolation, Processing Imbalanced Data, and Sparse Features</title>
        <p>Except for age and gender, we used an RF to interpolate the missing values for each of the remaining variables. Nonmissing patient data were used as a training set to train the model to interpolate missing values. The training set was further split into training data (80%) and validation data (20%). The coefficient of determination R<sup>2</sup> and the κ coefficient were used to test the consistency of the imputation results of continuous variables and categorical variables, respectively. In the interpolation process, the median of R<sup>2</sup> was 0.623 (IQR 0.647) and the median of the κ coefficient was 0.444 (IQR 0.285).</p>
        <p>Due to the extreme imbalance of our original data, the number of patients with sudden death only accounted for 5% (977/18,936) of the total sample size. We generated 4 different data ratios (1:10, 1:5, 1:2, and 1:1) through k-means to achieve undersampling. These data were used with the original ratio to evaluate models of different data ratios and then to verify the rationality of our sampling method.</p>
      </sec>
      <sec>
        <title>Validation by a Sudden-Death Case Study</title>
        <sec>
          <title>Analyzing Risk Factors of Sudden Death</title>
          <p>We constructed an LR model to analyze the patients’ laboratory test variables using a data set with a data ratio of 1:1 as the data source to filter variables. To reflect the degree of correlation between variables, continuous variables were treated as ordinal categorical variables. Taking the normal index range of the variables as a reference point, the test results of the patients were mapped into 3 categories: L (index is lower than the normal value), N (index is normal), and H (index is higher than the normal value). To determine the significant factors affecting the sudden death of patients and avoid a negative effect on the final analysis results, we first performed the chi-square test and excluded variables with <italic>P</italic>&#62;.10. Next, LR univariate analysis was performed to filter out variables with <italic>P</italic>&#62;.05. <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>, respectively, show the variables excluded by the chi-square test and the LR univariate analysis, and their <italic>P</italic> values. We reintroduced some of the excluded variables into the final candidate variable set according to the literature review and the advice of consulting medical experts, including urine specific gravity, chloride, hematocrit, sodium, magnesium, lactate dehydrogenase, urine ketone body test, red blood cell count, and serum albumin. These variables were not statistically significant but are clinically related to sudden death. Finally, we selected 4 subgroups from the set of statistically significant variables. In addition, variables that were not statistically significant but were related to outcome events were also grouped separately. The final 5 groups were subjected to LR multivariate analysis, and the groups were as follows:</p>
          <list list-type="bullet">
            <list-item>
              <p>Group 1: qualitative test of creatinine, serum uric acid, urine protein</p>
            </list-item>
            <list-item>
              <p>Group 2: γ-glutamyl transferase, alanine aminotransferase, total bilirubin</p>
            </list-item>
            <list-item>
              <p>Group 3: international normalized ratio, platelet count, plasma prothrombin time</p>
            </list-item>
            <list-item>
              <p>Group 4: potassium, creatine kinase</p>
            </list-item>
            <list-item>
              <p>Group 5: urine specific gravity, chloride, hematocrit, sodium, magnesium, lactate dehydrogenase, urine ketone body test, red blood cell count, serum albumin</p>
            </list-item>
          </list>
          <p>For each group, 500-fold bootstrapping was used for model training and evaluation [<xref ref-type="bibr" rid="ref48">48</xref>]. Each bootstrap randomly split 70% of the data into the training set and 30% of the data into the test set. Finally, the mean values of AUROC, recall, and <italic>F</italic><sub>1</sub>-score for 500 training sessions in each group were reported, and the 95% CI of the AUROC was also reported. <xref ref-type="table" rid="table5">Table 5</xref> illustrates the model evaluation results of the 5 groups of variables. The performance parameters of group 2 were the best among the 5 groups of variables. In the recognition of patients with sudden death, a recall rate of 0.801 was obtained, the <italic>F</italic><sub>1</sub>-score was 0.835, and the model’s AUROC was 0.843 (95% CI 0.842-0.844). The results showed that this set of variables can better identify patients with sudden death. Therefore, variables from the other groups were successively added to the group 2 variables, and AUROC was taken as the evaluation index. The added variables would be included in the final model if AUROC could be improved. In the end, 13 laboratory test risk variables related to sudden death events were determined, and the patient’s gender variable was retained as a demographic feature. In general, the final variables used included γ-glutamyl transferase, alanine aminotransferase, total bilirubin, creatinine, serum uric acid, the international normalized ratio, creatine kinase, the platelet count, potassium, gender, sodium, magnesium, chloride, and serum albumin. These variables were used to build the final LR model. <xref ref-type="table" rid="table6">Table 6</xref> shows the results of LR multivariate analysis.</p>
          <p>After determining the patient features for analysis, we split the original scale data into a training set (70%) and a test set (30%). For the training set, 4 different categories of data sets (1:1, 1:2, 1:5, 1:10) were formed by undersampling to train the model. Finally, the performance of the model was evaluated on the test set. The mean and 95% CI (500-fold bootstrapping) of the final AUROC, AUPRC, <italic>F</italic><sub>1</sub>-score, and recall are shown in Supplementary Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In addition, we further used Brier scores to evaluate the calibration ability of models trained with different data ratios.</p>
          <p>In general, as the data ratio tended to balance, the performance of the model gradually improved. <xref rid="figure6" ref-type="fig">Figures 6</xref> and <xref rid="figure7" ref-type="fig">7</xref> show the model receiver operating characteristic (ROC) curve (<xref rid="figure6" ref-type="fig">Figure 6</xref>) and the precision-recall (PR) curve (<xref rid="figure7" ref-type="fig">Figure 7</xref>) of the 4 data ratios. In recognizing patients with sudden death, the best model obtained a recall rate of 0.863 (95% CI 0.862-0.865), the <italic>F</italic><sub>1</sub>-score was 0.84 (95% CI 0.839-0.842), the AUROC of the model was 0.895 (95% CI 0.894-0.896), and the AUPRC was 0.897 (95% CI 0.896-0.899). The original scale data model performed the worst, with an AUROC of 0.812 (95% CI 0.811-0.813) and an AUPRC of 0.407 (95% CI 0.404-0.409). We plotted the reliability curves of 5 training sets with different data ratios on the same test set and calculated Brier scores (Supplementary Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Consistent with the viewpoint mentioned by Geeven et al [<xref ref-type="bibr" rid="ref49">49</xref>], imbalance correction actually weakened the clinical application value of the model, which was mainly manifested in the poor calibration ability of the model. As the degree of undersampling increased, the calibration of the model became worse: the Brier score was 0.16 and 0.108 for the data ratio of 1:1 and the original data ratio, respectively. Imbalance correction can balance the sensitivity and specificity of the model to a greater extent and avoid biased errors in the model. Undersampling improved the AUROC, <italic>F</italic><sub>1</sub>-score, and AUPRC relative to the model trained with the original data ratio. Although the Brier score increased (ie, calibration worsened), the gap was not large. 
To observe the risk factors of sudden death in patients more intuitively, we visualized the regression coefficients of the best model after performing LR (<xref rid="figure8" ref-type="fig">Figure 8</xref>) to observe the relationship between variables and sudden-death events.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Statistics of variables filtered by the chi-square test.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="570"/>
              <col width="280"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>Variable</td>
                  <td><italic>χ</italic><sup>2</sup> (<italic>df</italic>)</td>
                  <td><italic>P</italic> value</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Monocytes</td>
                  <td>5.433 (6)</td>
                  <td>.49</td>
                </tr>
                <tr valign="top">
                  <td>Basophil</td>
                  <td>0.705 (4)</td>
                  <td>.95</td>
                </tr>
                <tr valign="top">
                  <td>Eosinophils</td>
                  <td>0.977 (4)</td>
                  <td>.91</td>
                </tr>
                <tr valign="top">
                  <td>Urine specific gravity determination</td>
                  <td>0 (2)</td>
                  <td>.99</td>
                </tr>
                <tr valign="top">
                  <td>Urine tube type</td>
                  <td>1.25 (4)</td>
                  <td>.87</td>
                </tr>
                <tr valign="top">
                  <td>Urine tube type (microscopic examination)</td>
                  <td>6.863 (8)</td>
                  <td>.98</td>
                </tr>
                <tr valign="top">
                  <td>Qualitative test of urinary bilirubin</td>
                  <td>13.185 (4)</td>
                  <td>.21</td>
                </tr>
                <tr valign="top">
                  <td>Mean erythrocyte hemoglobin concentration</td>
                  <td>7.828 (6)</td>
                  <td>.25</td>
                </tr>
                <tr valign="top">
                  <td>Chloride</td>
                  <td>4.649 (6)</td>
                  <td>.59</td>
                </tr>
                <tr valign="top">
                  <td>Erythrocyte volume distribution width measurement coefficient of variation (CV)</td>
                  <td>1.148 (4)</td>
                  <td>.89</td>
                </tr>
                <tr valign="top">
                  <td>Hematocrit assay</td>
                  <td>4.982 (6)</td>
                  <td>.55</td>
                </tr>
                <tr valign="top">
                  <td>Sodium</td>
                  <td>7.915 (6)</td>
                  <td>.24</td>
                </tr>
                <tr valign="top">
                  <td>Magnesium</td>
                  <td>10.22 (6)</td>
                  <td>.12</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Statistics of variables screened by LR<sup>a</sup> univariate analysis.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="300"/>
              <col width="260"/>
              <col width="310"/>
              <col width="130"/>
              <thead>
                <tr valign="top">
                  <td>Variable</td>
                  <td>Reference range</td>
                  <td>OR<sup>b</sup> (95% CI)</td>
                  <td><italic>P</italic> value</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Lactate dehydrogenase</td>
                  <td>50.0-150.0 U/L</td>
                  <td>1.029 (0.94-1.127)</td>
                  <td>.53</td>
                </tr>
                <tr valign="top">
                  <td>Urine ketone body test</td>
                  <td>N/A<sup>c</sup></td>
                  <td>0.912 (0.769-1.081)</td>
                  <td>.29</td>
                </tr>
                <tr valign="top">
                  <td>Red blood cell count</td>
                  <td>3.5-5.9 ×10<sup>12</sup>/L</td>
                  <td>0.827 (0.642-1.065)</td>
                  <td>.14</td>
                </tr>
                <tr valign="top">
                  <td>Serum albumin</td>
                  <td>35.0-50.0 g/L</td>
                  <td>0.893 (0.689-1.157)</td>
                  <td>.39</td>
                </tr>
                <tr valign="top">
                  <td>High-density lipoprotein cholesterol</td>
                  <td>1.0-1.6 mmol/L</td>
                  <td>0.961 (0.749-1.232)</td>
                  <td>.75</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>LR: logistic regression.</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>OR: odds ratio.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>N/A: not applicable.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Comparing the performance of 5 groups of variables.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="140"/>
              <col width="270"/>
              <col width="180"/>
              <col width="410"/>
              <thead>
                <tr valign="top">
                  <td>Group</td>
                  <td>Recall</td>
                  <td><italic>F</italic><sub>1</sub>-score</td>
                  <td>AUROC<sup>a</sup> (95% CI)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>0.478</td>
                  <td>0.6</td>
                  <td>0.683 (0.681-0.684)</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>0.801</td>
                  <td>0.835</td>
                  <td>0.843 (0.842-0.844)</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>0.606</td>
                  <td>0.687</td>
                  <td>0.725 (0.724-0.727)</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>0.484</td>
                  <td>0.605</td>
                  <td>0.686 (0.685-0.687)</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>0.852</td>
                  <td>0.651</td>
                  <td>0.562 (0.561-0.564)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>LR<sup>a</sup> multivariate analysis.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="280"/>
              <col width="370"/>
              <col width="350"/>
              <thead>
                <tr valign="top">
                  <td>Variable</td>
                  <td>Reference range</td>
                  <td>OR<sup>b</sup> (95% CI)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>γ-Glutamyl transferase</td>
                  <td>0.0-50.0 U/L</td>
                  <td>0.225 (0.222-0.228)</td>
                </tr>
                <tr valign="top">
                  <td>Alanine aminotransferase</td>
                  <td>5.0-40.0 U/L</td>
                  <td>1.828 (1.804-1.852)</td>
                </tr>
                <tr valign="top">
                  <td>Total bilirubin</td>
                  <td>0.0-21.0 μmol/L</td>
                  <td>19.954 (19.7-20.2)</td>
                </tr>
                <tr valign="top">
                  <td>Creatinine</td>
                  <td>30.0-110.0 μmol/L</td>
                  <td>1.352 (1.331-1.372)</td>
                </tr>
                <tr valign="top">
                  <td>Serum uric acid</td>
                  <td>104.0-444.0 μmol/L</td>
                  <td>1.346 (1.334-1.359)</td>
                </tr>
                <tr valign="top">
                  <td>International normalized ratio</td>
                  <td>0.8-1.2</td>
                  <td>2.23 (2.188-2.272)</td>
                </tr>
                <tr valign="top">
                  <td>Creatine kinase</td>
                  <td>24.0-320.0 U/L</td>
                  <td>2.457 (2.431-2.483)</td>
                </tr>
                <tr valign="top">
                  <td>Platelet count</td>
                  <td>100.0-300.0 ×10<sup>9</sup>/L</td>
                  <td>0.623 (0.617-0.629)</td>
                </tr>
                <tr valign="top">
                  <td>Potassium</td>
                  <td>3.5-5.1 mmol/L</td>
                  <td>1.057 (1.043-1.07)</td>
                </tr>
                <tr valign="top">
                  <td>Gender</td>
                  <td>Female</td>
                  <td>0.183 (0.182-0.184)</td>
                </tr>
                <tr valign="top">
                  <td>Sodium</td>
                  <td>135-145 mmol/L</td>
                  <td>2.182 (2.102-2.262)</td>
                </tr>
                <tr valign="top">
                  <td>Magnesium</td>
                  <td>0.8-1.0 mmol/L</td>
                  <td>4.807 (4.587-5.027)</td>
                </tr>
                <tr valign="top">
                  <td>Chloride</td>
                  <td>96.00-106.00 mmol/L</td>
                  <td>0.615 (0.603-0.627)</td>
                </tr>
                <tr valign="top">
                  <td>Serum albumin</td>
                  <td>35-51 g/L</td>
                  <td>1.284 (1.268-1.3)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table6fn1">
                <p><sup>a</sup>LR: logistic regression.</p>
              </fn>
              <fn id="table6fn2">
                <p><sup>b</sup>OR: odds ratio.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>ROC curves of different data ratios. AUC: area under the curve; ROC: receiver operating characteristic.</p>
            </caption>
            <graphic xlink:href="medinform_v11i1e38590_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure7" position="float">
            <label>Figure 7</label>
            <caption>
              <p>PR curves of different data ratios. AUPRC: area under the precision-recall curve; PR: precision-recall.</p>
            </caption>
            <graphic xlink:href="medinform_v11i1e38590_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure8" position="float">
            <label>Figure 8</label>
            <caption>
              <p>Visualization of logistic regression coefficients.</p>
            </caption>
            <graphic xlink:href="medinform_v11i1e38590_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Development of Other ML Models</title>
          <p>We used interpolated and undersampled data (data ratio 1:1) to train several other ML models and evaluate their performance. The training models included an RF [<xref ref-type="bibr" rid="ref50">50</xref>], a gradient boosting machine (GBM) [<xref ref-type="bibr" rid="ref51">51</xref>], a support vector machine (SVM) [<xref ref-type="bibr" rid="ref52">52</xref>], and least absolute shrinkage and selection operator (LASSO) [<xref ref-type="bibr" rid="ref53">53</xref>], which are also often used to develop medical prediction models [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref54">54</xref>]. We used 500-fold bootstrapping for internal validation. Each bootstrap used 70% data for training and the remaining 30% data for performance evaluation. The area under the curve (AUC), AUPRC, recall, and <italic>F</italic><sub>1</sub>-score and their 95% CI values were reported. Before model training, a grid search was conducted to tune the best hyperparameter of each model through 5-fold cross-validation. The hyperparameter settings of each model are shown in Supplementary Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The ROC curve and PR curve of the models are shown in Supplementary Figures S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, respectively, and the performance evaluation results are shown in Supplementary Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In general, the performance of the RF and GBM with an integrated scheme was the best, with an AUC of 0.936 (95% CI 0.934-0.937) and 0.931 (95% CI 0.93-0.932), respectively, and an <italic>F</italic><sub>1</sub>-score of 0.857 (95% CI 0.856-0.858) and 0.821 (95% CI 0.82-0.823), respectively. This can benefit from the generalization and the ability to deal with complex feature relationships of the integrated model. 
The comprehensive decision results of multiple base learners are more stable than the single-model prediction results, and the performance is better. The SVM also performed better than the LR and LASSO, which are linear models, with an AUC of 0.913 (95% CI 0.912-0.914). This shows that there are some nonlinear features we used that made the linear model insufficient to recognize the relationship between these features.</p>
        </sec>
        <sec>
          <title>Diagnostic Data Analysis Results</title>
          <p>The final sample included 1083 patients with sudden death and 615 patients with nonsudden death. <xref ref-type="table" rid="table7">Table 7</xref> shows the number of confirmed patients with 18 variables. The final diagnostic variables used included hypertension, myocardial infarction, cerebral hemorrhage, cardiac arrest, abdominal pain, atrial fibrillation, fever, trauma, respiratory arrest, diabetes, coronary heart disease, and cerebral infarction.</p>
          <p>We used 500-fold bootstrapping for internal validation of the model. For each bootstrap, 70% of the samples were randomly selected as the training set and 30% as the test set to evaluate the model. The final reported model performance was the mean and 95% CI of 500 results [<xref ref-type="bibr" rid="ref48">48</xref>].</p>
          <p>The first 17 PCs that could explain 98.2% of the variance of the original sample were selected as new variables for analysis. To observe the role of PCA, we compared the 2 schemes: the LR model using the original data and the LR model after dimensionality reduction using PCA. The LR model trained with the original data obtained a recall rate of 0.445 (95% CI 0.443-0.448), an <italic>F</italic><sub>1</sub>-score of 0.562 (95% CI 0.56-0.564), and an AUROC of 0.602 (95% CI 0.6-0.603). After PCA dimensionality reduction of the original data, the PC variable was used as the data source to train the LR model, and a recall rate of 0.746 (95% CI 0.731-0.76) was obtained, the <italic>F</italic><sub>1</sub>-score was 0.73 (95% CI 0.721-0.738), and the AUROC of the model was 0.708 (95% CI 0.707-0.71). <xref rid="figure9" ref-type="fig">Figure 9</xref> shows the ROC curves of the 2 models. The LR model using the original data had the phenomenon of variable separation, which is reflected in the abnormally high OR values of cardiac arrest and respiratory arrest (201568034532 and 1211118945) and an abnormal 95% CI, which makes the results unreliable. In addition, the performance of the model was poor, and only a recall rate of 0.445 was obtained in the identification of patients with sudden death, which means that the identification ability of the model for patients with sudden death is not strong. After PCA dimensionality reduction, the data were no longer sparse, the model parameters were better fitted, and the model performance improved to a certain extent. In addition, data conversion also eliminated the problems of variable separation and multicollinearity.</p>
          <p>To determine the impact of various diagnostic variables on the sudden death of emergency patients, we statistically analyzed the results of multivariate analysis on 17 PCs input into the LR model. The OR of PC4, PC5, and PC6 was 3.044, 2.859, and 3.931, respectively, showing a significant correlation with sudden-death events (<xref ref-type="table" rid="table8">Table 8</xref>). In each PC, the magnitude of the loading, the elements in the PC, reflected the importance of the original variable in the PC (Supplementary Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The loadings of all components showed that cerebral infarction, hypertension, and pulmonary infection were the top 3 variables in PC4. In PC5 and PC6, the top 3 variables were consciousness disorder, diabetes, and fever. Based on the results of the 3 PCs, we believe that the 6 diagnoses of cerebral infarction, hypertension, pulmonary infection, consciousness disorder, diabetes, and fever are significantly associated with sudden death in emergency patients.</p>
          <table-wrap position="float" id="table7">
            <label>Table 7</label>
            <caption>
              <p>Statistics of people diagnosed.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="300"/>
              <col width="700"/>
              <thead>
                <tr valign="top">
                  <td>Variable</td>
                  <td>People with sudden death diagnosed, n (%)/people with nonsudden death diagnosed, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Myocardial infarction</td>
                  <td>57 (5.26)/23 (3.74)</td>
                </tr>
                <tr valign="top">
                  <td>Chest tightness</td>
                  <td>8 (0.74)/35 (5.69)</td>
                </tr>
                <tr valign="top">
                  <td>Cardiac arrest</td>
                  <td>120 (11.08)/0</td>
                </tr>
                <tr valign="top">
                  <td>Fever</td>
                  <td>50 (4.62)/43 (6.99)</td>
                </tr>
                <tr valign="top">
                  <td>Rib fracture</td>
                  <td>58 (5.36)/3 (0.49)</td>
                </tr>
                <tr valign="top">
                  <td>Abnormal renal function</td>
                  <td>42 (3.88)/35 (5.69)</td>
                </tr>
                <tr valign="top">
                  <td>Chest pain</td>
                  <td>18 (1.66)/38 (6.18)</td>
                </tr>
                <tr valign="top">
                  <td>Diabetes</td>
                  <td>65 (6.00)/66 (10.73)</td>
                </tr>
                <tr valign="top">
                  <td>Abdominal pain</td>
                  <td>30 (2.77)/45 (7.32)</td>
                </tr>
                <tr valign="top">
                  <td>Pulmonary infection</td>
                  <td>85 (7.85)/64 (10.41)</td>
                </tr>
                <tr valign="top">
                  <td>Respiratory arrest</td>
                  <td>106 (9.79)/0</td>
                </tr>
                <tr valign="top">
                  <td>Trauma</td>
                  <td>58 (5.36)/16 (2.60)</td>
                </tr>
                <tr valign="top">
                  <td>Atrial fibrillation</td>
                  <td>39 (3.60)/33 (5.37)</td>
                </tr>
                <tr valign="top">
                  <td>Disturbance of consciousness</td>
                  <td>82 (7.57)/17 (2.76)</td>
                </tr>
                <tr valign="top">
                  <td>Cerebral hemorrhage</td>
                  <td>77 (7.11)/26 (4.23)</td>
                </tr>
                <tr valign="top">
                  <td>Cerebral infarction</td>
                  <td>75 (6.93)/71 (11.54)</td>
                </tr>
                <tr valign="top">
                  <td>Coronary heart disease</td>
                  <td>29 (2.68)/39 (6.34)</td>
                </tr>
                <tr valign="top">
                  <td>Hypertension</td>
                  <td>65 (6.00)/106 (17.24)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure9" position="float">
            <label>Figure 9</label>
            <caption>
              <p>ROC curves of 2 models. AUC: area under the curve; LR: logistic regression; PCA: principal component analysis; ROC: receiver operating characteristic.</p>
            </caption>
            <graphic xlink:href="medinform_v11i1e38590_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table8">
            <label>Table 8</label>
            <caption>
              <p>PC<sup>a</sup> regression results</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="800"/>
              <thead>
                <tr valign="top">
                  <td>PC</td>
                  <td>OR<sup>b</sup> (95% CI)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>0.239 (0.235-0.242)</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>2.429 (2.383-2.476)</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>1.19 (1.126-1.253)</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>3.044 (2.948-3.141)</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>2.859 (2.687-3.031)</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>3.931 (3.714-4.148)</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>1.49 (1.405-1.575)</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>1.699 (1.562-1.836)</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>2.104 (1.949-2.259)</td>
                </tr>
                <tr valign="top">
                  <td>10</td>
                  <td>2.153 (2.016-2.289)</td>
                </tr>
                <tr valign="top">
                  <td>11</td>
                  <td>2.451 (2.191-2.711)</td>
                </tr>
                <tr valign="top">
                  <td>12</td>
                  <td>2.031 (1.855-2.206)</td>
                </tr>
                <tr valign="top">
                  <td>13</td>
                  <td>1.457 (1.339-1.575)</td>
                </tr>
                <tr valign="top">
                  <td>14</td>
                  <td>0.949 (0.863-1.034)</td>
                </tr>
                <tr valign="top">
                  <td>15</td>
                  <td>1.423 (1.231-1.614)</td>
                </tr>
                <tr valign="top">
                  <td>16</td>
                  <td>2.546 (2.221-2.871)</td>
                </tr>
                <tr valign="top">
                  <td>17</td>
                  <td>0.182 (0.164-0.201)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table8fn1">
                <p><sup>a</sup>PC: principal component.</p>
              </fn>
              <fn id="table8fn2">
                <p><sup>b</sup>OR: odds ratio.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this paper, 3 ML schemes were proposed to deal with missing, imbalanced, and sparse features in the process of developing sudden-death prediction models using emergency medicine data, which improves the performance of the developed model. To solve the problem of missing data, we propose an RF method to use real data to interpolate missing data. In the interpolation process, the consistency of the interpolation results is checked using the coefficient of determination (R<sup>2</sup>) and the κ coefficient. From the interpolation results, the method shows the ability to correctly interpolate missing data. Imbalanced data are not conducive to obtaining accurate analysis results, and the model will be more inclined to predict new samples as patients with nonsudden death [<xref ref-type="bibr" rid="ref15">15</xref>]. In view of this phenomenon, we used the k-means algorithm to generate multiple data sets with different proportions of different categories by undersampling to evaluate the model. The method based on k-means can better preserve the patient's characteristic information. This method will not lose some representative patient samples due to random discarding, thus reducing the bias caused by sampling. The results show that the comprehensive performance of the model gradually improves as the data tend to balance (<xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure5" ref-type="fig">5</xref>). However, imbalance correction will weaken the calibration ability of the model and increase the calibration error. Data sparsity is also not conducive to modeling and analysis. When the samples are too sparse, the results of the classifier based on maximum-likelihood estimation will become unreliable, because there may be variable separation and multicollinearity [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. 
PC regression analysis is a method that uses PCA to extract the PC information about the original samples and uses PCs to replace the original variables for regression modeling [<xref ref-type="bibr" rid="ref39">39</xref>]. In our diagnostic data, the LR model using the original data showed the phenomenon of variable separation, which led to unreliable results and poor performance. The performance of the PC regression model has been improved. In addition, we can analyze the diagnosis significantly related to the sudden death of emergency patients from the results of PC regression. These diagnoses are consistent with previous findings [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <p>At present, there are many studies on the prediction of sudden death. Yu et al [<xref ref-type="bibr" rid="ref54">54</xref>] constructed an ML model to predict sudden cardiac death (SCD) in 15,661 patients with atherosclerosis. The results showed that the ML model performs better than the standard Poisson regression model and the AUROC of the ML model was 0.89. Karen et al [<xref ref-type="bibr" rid="ref56">56</xref>] trained an ML-based early warning model for identifying sudden infant death syndrome using the public data set “Lipidomic in sudden infant death syndrome.” The RF algorithm achieved an AUROC of 0.9 and a recall of 0.8. Ye et al [<xref ref-type="bibr" rid="ref5">5</xref>] selected a variety of ML algorithms to build an early real-time early warning system (EWS) to predict the death risk of emergency patients and carried out prospective validation. The results showed that the EWS could give an early warning within 40 hours before sudden death, and the AUROC reached 0.884. Bhattacharya et al [<xref ref-type="bibr" rid="ref57">57</xref>] used the electronic health records of 711 patients with hypertrophic cardiomyopathy and established an LR and naive Bayesian model with 22 variables, including statins, a family history of SCD, and left ventricular ejection fraction, to predict the risk of sudden death (ventricular fibrillation) in these patients. The sensitivity and specificity of the optimal model were 0.73 and 0.76, respectively, and the AUROC was 0.83. For our model, in the LR model constructed by using laboratory test data, the AUROC reached 0.895. After imbalance correction, the recall rate and AUPRC improved, reaching 0.863 and 0.897, respectively. Compared to the existing sudden-death prediction model based on ML, the performance of our model can achieve a similar effect, further indicating that our data-preprocessing methods can preserve the patient's characteristic information and improve the availability of emergency care.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This work also has some limitations. On the one hand, we only considered a single ML algorithm for data interpolation and did not discuss and compare the application of other possible ML algorithms in interpolation. It is possible that we overlooked the better performance of other methods. For example, for our data, due to the large proportion of missing and seriously imbalanced categorical variables, although we tried to adjust the relatively balanced data set to train the model, the κ coefficient improved to a certain extent but the effect was still poor. Therefore, a further discussion of ML methods that can handle a large number of missing and unbalanced categories or more reasonable feature processing may achieve better imputation results. Imbalance correction can improve the sensitivity and specificity of the model and avoid biased errors of the model. However, this correction will also weaken the clinical application value of the model, lowering the calibration ability of the model and making it unable to accurately estimate the risk probability of patients. For the prediction model, the calibration ability of the model was not high, even on the original scale data set. Model calibration is another important characteristic of evaluating the clinical significance of prediction models. A well-calibrated model can provide more useful information for clinical decisions [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. We can further consider using isotonic regression [<xref ref-type="bibr" rid="ref60">60</xref>] to calibrate the model to improve its clinical application value. 
In addition, although the solution to deal with missing, imbalanced, and sparse features proposed by us is not the latest method, it is sufficient to solve the main data quality problems encountered in the development of prediction models for sudden death, which is reflected in the improvement of model performance and the consistency of the risk factors of sudden death obtained with the earlier literature results. In the future, we need to further explore the latest methods to solve these 3 data quality problems so as to extend the data-processing process to other data sets and provide a more reliable data source for prediction models. With regard to the construction of risk factor prediction models for patients with sudden death, we have a broad definition of sudden death, including patients who have undergone rescue or death events. These patients may include some nonemergency death cases, which may have a confusing effect on the final model. In addition, our feature selection was completely based on data, and only the remaining variables were trained in groups during the model training stage. This form can reduce the complexity of manually selecting features and also explore some potential risk variables. However, some clinically significant variables will also be discarded. Therefore, whether the model has clinical guiding significance remains to be further investigated. As a case study, we used LR as the main prediction model, which facilitated us to develop and analyze the risk factors of sudden death. However, the processing capacity of the LR model for nonlinear predictors is insufficient, resulting in insufficient performance of the developed model [<xref ref-type="bibr" rid="ref17">17</xref>]. This can be seen from the results of other ML models we additionally developed (the RF and GBM had the best performance, with an AUC of 0.936 and 0.931, respectively, which are better than LR models). 
Therefore, in the future, we will further optimize the data-preprocessing process and try to develop ML models with better performance to improve the clinical usability.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>Our work proposes to use ML methods to deal with data quality issues, such as missing data, data imbalance, and sparse features in emergency data, so as to improve data availability. In addition, the risk factors of sudden death in emergency patients are obtained from our model analysis. As a preliminary analysis result, this result is also the basis for the later use of ML algorithms to build the feature selection and data analysis of the prediction model of sudden death in emergency patients.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary Tables and Figures.</p>
        <media xlink:href="medinform_v11i1e38590_app1.docx" xlink:title="DOCX File , 278 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUPRC</term>
          <def>
            <p>area under the precision-recall curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ED</term>
          <def>
            <p>emergency department</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EWS</term>
          <def>
            <p>early real-time early warning system</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">GBM</term>
          <def>
            <p>gradient boosting machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LASSO</term>
          <def>
            <p>least absolute shrinkage and selection operator</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">LDA</term>
          <def>
            <p>linear discriminant analysis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MAR</term>
          <def>
            <p>missing at random</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">MCAR</term>
          <def>
            <p>missing completely at random</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">MNAR</term>
          <def>
            <p>not missing at random</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">OR</term>
          <def>
            <p>odds ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">PC</term>
          <def>
            <p>principal component</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">PCA</term>
          <def>
            <p>principal component analysis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">PR</term>
          <def>
            <p>precision-recall</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">SCD</term>
          <def>
            <p>sudden cardiac death</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to show their appreciation to the engineers working in the Information Centre (ICT) department of the Hainan Hospital of Chinese PLA General Hospital for their help with data preparation.</p>
      <p>The publication of this paper was funded by grants from the National Natural Science Foundation of China (no. 82102187) and the Hainan Natural Science Foundation Youth Fund (no. 620QN380).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets used and analyzed during this study are available from the first author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>XC carried out the methodological study and drafted the manuscript. HC collected and processed the data and drafted the manuscript. SN made the conceptual design and made critical revisions to the manuscript. XK reviewed the methodology and reviewed the manuscript. HD also reviewed the manuscript. HD conceptualized the study and performed a critical review.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nagamine</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gillette</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kahoun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Burghaus</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lippert</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saxena</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Multiscale classification of heart failure phenotypes by unsupervised clustering of unstructured electronic medical record data</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <month>12</month>
          <day>07</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>21340</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-77286-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-77286-6</pub-id>
          <pub-id pub-id-type="medline">33288774</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-77286-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC7721729</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fernandes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mendes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vieira</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Leite</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Palos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Finkelstein</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Horng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Risk of mortality and cardiopulmonary arrest in critical patients presenting to the emergency department using machine learning and natural language processing</article-title>
          <source>PLoS One</source>
          <year>2020</year>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>203</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0230876"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0230876</pub-id>
          <pub-id pub-id-type="medline">32240233</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-27950</pub-id>
          <pub-id pub-id-type="pmcid">PMC7117713</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goulet</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Guerand</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bloom</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Martel</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Aegerter</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Casalino</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Riou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Freund</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Unexpected death within 72 hours of emergency department visit: were those deaths preventable?</article-title>
          <source>Crit Care</source>
          <year>2015</year>
          <month>04</month>
          <day>08</day>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>154</fpage>
          <lpage>164</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ccforum.biomedcentral.com/articles/10.1186/s13054-015-0877-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13054-015-0877-x</pub-id>
          <pub-id pub-id-type="medline">25887707</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13054-015-0877-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC4403754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Reps</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Garibaldi</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Qureshi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Can machine-learning improve cardiovascular risk prediction using routine clinical data?</article-title>
          <source>PLoS One</source>
          <year>2017</year>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>174</fpage>
          <lpage>186</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0174944"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0174944</pub-id>
          <pub-id pub-id-type="medline">28376093</pub-id>
          <pub-id pub-id-type="pii">PONE-D-16-49429</pub-id>
          <pub-id pub-id-type="pmcid">PMC5380334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ellrodt</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Stearns</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sylvester</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Widen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McElhinney</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A Real-Time Early Warning System for Monitoring Inpatient Mortality Risk: Prospective Study Using Electronic Medical Record Data</article-title>
          <source>J Med Internet Res</source>
          <year>2019</year>
          <month>07</month>
          <day>05</day>
          <volume>21</volume>
          <issue>7</issue>
          <fpage>137</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2019/7/e13719/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/13719</pub-id>
          <pub-id pub-id-type="medline">31278734</pub-id>
          <pub-id pub-id-type="pii">v21i7e13719</pub-id>
          <pub-id pub-id-type="pmcid">PMC6640073</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Navar</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Pencina</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ioannidis</surname>
              <given-names>JPA</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges in developing risk prediction models with electronic health records data: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>01</month>
          <day>17</day>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>198</fpage>
          <lpage>208</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27189013"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocw042</pub-id>
          <pub-id pub-id-type="medline">27189013</pub-id>
          <pub-id pub-id-type="pii">ocw042</pub-id>
          <pub-id pub-id-type="pmcid">PMC5201180</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Srinivasan</surname>
              <given-names>NT</given-names>
            </name>
            <name name-style="western">
              <surname>Schilling</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Sudden cardiac death and arrhythmias</article-title>
          <source>Arrhythm Electrophysiol Rev</source>
          <year>2018</year>
          <month>06</month>
          <volume>7</volume>
          <issue>2</issue>
          <fpage>111</fpage>
          <lpage>117</lpage>
          <pub-id pub-id-type="doi">10.15420/aer.2018:15:2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Nanavati</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mehta</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mounsey</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nwosu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pursell</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Mounsey</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Simpson</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Estimated incidence and risk factors of sudden unexpected death</article-title>
          <source>Open Heart</source>
          <year>2016</year>
          <month>03</month>
          <day>23</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>32</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openheart.bmj.com/lookup/pmidlookup?view=long&#38;pmid=27042316"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/openhrt-2015-000321</pub-id>
          <pub-id pub-id-type="medline">27042316</pub-id>
          <pub-id pub-id-type="pii">openhrt-2015-000321</pub-id>
          <pub-id pub-id-type="pmcid">PMC4809187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adabag</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Luepker</surname>
              <given-names>RV</given-names>
            </name>
            <name name-style="western">
              <surname>Roger</surname>
              <given-names>VL</given-names>
            </name>
            <name name-style="western">
              <surname>Gersh</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>Sudden cardiac death: epidemiology and risk factors</article-title>
          <source>Nat Rev Cardiol</source>
          <year>2010</year>
          <month>04</month>
          <day>09</day>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>216</fpage>
          <lpage>225</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20142817"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nrcardio.2010.3</pub-id>
          <pub-id pub-id-type="medline">20142817</pub-id>
          <pub-id pub-id-type="pii">nrcardio.2010.3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5014372</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Asadollahi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hastings</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Beeching</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gill</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Laboratory risk factors for hospital mortality in acutely admitted patients</article-title>
          <source>QJM</source>
          <year>2007</year>
          <month>08</month>
          <day>02</day>
          <volume>100</volume>
          <issue>8</issue>
          <fpage>501</fpage>
          <lpage>507</lpage>
          <pub-id pub-id-type="doi">10.1093/qjmed/hcm055</pub-id>
          <pub-id pub-id-type="medline">17609227</pub-id>
          <pub-id pub-id-type="pii">hcm055</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Y-G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K-F</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>An accurate predictor of liver failure and death after hepatectomy: a single institution's experience with 478 consecutive cases</article-title>
          <source>World J Gastroenterol</source>
          <year>2014</year>
          <month>01</month>
          <day>07</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>274</fpage>
          <lpage>281</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.wjgnet.com/1007-9327/full/v20/i1/274.htm"/>
          </comment>
          <pub-id pub-id-type="doi">10.3748/wjg.v20.i1.274</pub-id>
          <pub-id pub-id-type="medline">24415882</pub-id>
          <pub-id pub-id-type="pmcid">PMC3886019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hernesniemi</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdiani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tynkkynen</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Lyytikäinen</surname>
              <given-names>L-P</given-names>
            </name>
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Lehtimäki</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Eskola</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nikus</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Antila</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Oksala</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Extensive phenotype data and machine learning in prediction of mortality in acute coronary syndrome - the MADDEC study</article-title>
          <source>Ann Med</source>
          <year>2019</year>
          <month>03</month>
          <volume>51</volume>
          <issue>2</issue>
          <fpage>156</fpage>
          <lpage>163</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31030570"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/07853890.2019.1596302</pub-id>
          <pub-id pub-id-type="medline">31030570</pub-id>
          <pub-id pub-id-type="pmcid">PMC7857486</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farahani</surname>
              <given-names>NZ</given-names>
            </name>
            <name name-style="western">
              <surname>Arunachalam</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Sundaram</surname>
              <given-names>DSB</given-names>
            </name>
            <name name-style="western">
              <surname>Pasupathy</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Enayati</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arruda-Olson</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Explanatory analysis of a machine learning model to identify hypertrophic cardiomyopathy patients from EHR using diagnostic codes</article-title>
          <source>Proc IEEE Int Conf Bioinformatics Biomed</source>
          <year>2020</year>
          <month>12</month>
          <volume>2020</volume>
          <fpage>1932</fpage>
          <lpage>1937</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34316386"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/bibm49941.2020.9313231</pub-id>
          <pub-id pub-id-type="medline">34316386</pub-id>
          <pub-id pub-id-type="pmcid">PMC8313105</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wells</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chagin</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Nowacki</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Kattan</surname>
              <given-names>MW</given-names>
            </name>
          </person-group>
          <article-title>Strategies for handling missing data in electronic health record derived data</article-title>
          <source>EGEMS (Wash DC)</source>
          <year>2013</year>
          <month>12</month>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>1035</fpage>
          <lpage>1043</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25848578"/>
          </comment>
          <pub-id pub-id-type="doi">10.13063/2327-9214.1035</pub-id>
          <pub-id pub-id-type="medline">25848578</pub-id>
          <pub-id pub-id-type="pii">egems1035</pub-id>
          <pub-id pub-id-type="pmcid">PMC4371484</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H-K</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C-T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J-H</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H-C</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>S-L</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>J-SR</given-names>
            </name>
          </person-group>
          <article-title>Early detecting in-hospital cardiac arrest based on machine learning on imbalanced data</article-title>
          <source>IEEE Int Conf Healthc Informatics</source>
          <year>2019</year>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1109/ICHI.2019.8904504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>BX</given-names>
            </name>
            <name name-style="western">
              <surname>Japkowicz</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Boosting support vector machines for imbalanced data sets</article-title>
          <source>Knowl Inf Syst</source>
          <year>2009</year>
          <month>03</month>
          <day>05</day>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1007/s10115-009-0198-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heinze</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A comparative investigation of methods for logistic regression with separated or nearly separated data</article-title>
          <source>Stat Med</source>
          <year>2006</year>
          <month>12</month>
          <day>30</day>
          <volume>25</volume>
          <issue>24</issue>
          <fpage>4216</fpage>
          <lpage>4226</lpage>
          <pub-id pub-id-type="doi">10.1002/sim.2687</pub-id>
          <pub-id pub-id-type="medline">16955543</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Principal component regression by principal component selection</article-title>
          <source>Commun Stat Appl Methods</source>
          <year>2015</year>
          <month>03</month>
          <day>31</day>
          <volume>22</volume>
          <issue>2</issue>
          <fpage>173</fpage>
          <lpage>180</lpage>
          <pub-id pub-id-type="doi">10.5351/csam.2015.22.2.173</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alvarez</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Halm</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Shannon</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Girod</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Amarasingham</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Predicting out of intensive care unit cardiopulmonary arrest or death using electronic medical record data</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2013</year>
          <month>02</month>
          <day>27</day>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>28</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-13-28"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-13-28</pub-id>
          <pub-id pub-id-type="medline">23442316</pub-id>
          <pub-id pub-id-type="pii">1472-6947-13-28</pub-id>
          <pub-id pub-id-type="pmcid">PMC3599266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seki</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawazoe</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ohe</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Machine learning-based prediction of in-hospital mortality using admission laboratory data: a retrospective, single-site study using electronic health record data</article-title>
          <source>PLoS One</source>
          <year>2021</year>
          <volume>16</volume>
          <issue>2</issue>
          <fpage>246</fpage>
          <lpage>255</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0246640"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0246640</pub-id>
          <pub-id pub-id-type="medline">33544775</pub-id>
          <pub-id pub-id-type="pii">PONE-D-20-20161</pub-id>
          <pub-id pub-id-type="pmcid">PMC7864463</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chhabra</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vashisht</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ranjan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A comparison of multiple imputation methods for data with missing values</article-title>
          <source>Indian J Sci Technol</source>
          <year>2017</year>
          <volume>10</volume>
          <issue>19</issue>
          <fpage>1</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.17485/ijst/2017/v10i19/110646</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Siah</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Machine-learning models for predicting drug approvals and clinical-phase transitions</article-title>
          <source>SSRN Journal</source>
          <year>2017</year>
          <fpage>1</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.2139/ssrn.2973611</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rajabi</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A systematic review of machine learning-based missing value imputation techniques</article-title>
          <source>Data Technol Appl</source>
          <year>2021</year>
          <month>04</month>
          <day>02</day>
          <volume>55</volume>
          <issue>4</issue>
          <fpage>558</fpage>
          <lpage>585</lpage>
          <pub-id pub-id-type="doi">10.1108/dta-12-2020-0298</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jadhav</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pramod</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ramanathan</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Comparison of performance of data imputation methods for numeric dataset</article-title>
          <source>Appl Artif Intell</source>
          <year>2019</year>
          <month>07</month>
          <day>04</day>
          <volume>33</volume>
          <issue>10</issue>
          <fpage>913</fpage>
          <lpage>933</lpage>
          <pub-id pub-id-type="doi">10.1080/08839514.2019.1637138</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Enders</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <source>Applied Missing Data Analysis</source>
          <year>2010</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Guilford Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gimpy</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Missing value imputation in multi attribute data set</article-title>
          <source>Int J Comput Sci Inf Technol</source>
          <year>2014</year>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>7</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Dealing with missing values in data</article-title>
          <source>J Syst Integr</source>
          <year>2014</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>42</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.20470/jsi.v5i1.178</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Helali</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A hybrid GP-KNN imputation for symbolic regression with missing values</article-title>
          <year>2018</year>
          <conf-name>Australasian Joint Conference on Artificial Intelligence</conf-name>
          <conf-date>December 2018</conf-date>
          <conf-loc>Wellington, New Zealand</conf-loc>
          <fpage>345</fpage>
          <lpage>357</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-030-03991-2_33</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Janabi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alkaim</surname>
              <given-names>AF</given-names>
            </name>
          </person-group>
          <article-title>A nifty collaborative analysis to predicting a novel tool (DRFLLS) for missing values estimation</article-title>
          <source>Soft Comput</source>
          <year>2019</year>
          <month>04</month>
          <day>11</day>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>555</fpage>
          <lpage>569</lpage>
          <pub-id pub-id-type="doi">10.1007/s00500-019-03972-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miyakoshi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kato</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A missing value imputation method using a Bayesian network with weighted learning</article-title>
          <source>Electron Commun Jpn</source>
          <year>2012</year>
          <month>11</month>
          <day>22</day>
          <volume>95</volume>
          <issue>12</issue>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1002/ecj.11449</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>Dealing with missing values</article-title>
          <source>Clinical Prediction Models</source>
          <year>2009</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>115</fpage>
          <lpage>137</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>White</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>van Buuren</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Missing data in clinical research: a tutorial on multiple imputation</article-title>
          <source>Can J Cardiol</source>
          <year>2021</year>
          <month>09</month>
          <volume>37</volume>
          <issue>9</issue>
          <fpage>1322</fpage>
          <lpage>1331</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0828-282X(20)31111-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.cjca.2020.11.010</pub-id>
          <pub-id pub-id-type="medline">33276049</pub-id>
          <pub-id pub-id-type="pii">S0828-282X(20)31111-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC8499698</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kokla</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Virtanen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kolehmainen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paananen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hanhineva</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Random forest-based imputation outperforms other methods for imputing LC-MS metabolomics data: a comparative study</article-title>
          <source>BMC Bioinformatics</source>
          <year>2019</year>
          <month>10</month>
          <day>11</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>492</fpage>
          <lpage>502</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3110-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-019-3110-0</pub-id>
          <pub-id pub-id-type="medline">31601178</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-019-3110-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6788053</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdelkhalek</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Brahim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Essousi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>A new way of handling missing data in multi-source classification based on adaptive imputation</article-title>
          <year>2018</year>
          <conf-name>8th International Conference on Model and Data Engineering</conf-name>
          <conf-date>October 24-26, 2018</conf-date>
          <conf-loc>Marrakesh, Morocco</conf-loc>
          <fpage>125</fpage>
          <lpage>136</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-030-00856-7_8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tiwari</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Colborn</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenberg</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Assessment of a machine learning model applied to harmonized electronic health record data for the prediction of incident atrial fibrillation</article-title>
          <source>JAMA Netw Open</source>
          <year>2020</year>
          <month>01</month>
          <day>03</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>36</fpage>
          <lpage>47</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31951272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.19396</pub-id>
          <pub-id pub-id-type="medline">31951272</pub-id>
          <pub-id pub-id-type="pii">2758859</pub-id>
          <pub-id pub-id-type="pmcid">PMC6991266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chawla</surname>
              <given-names>NV</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>LO</given-names>
            </name>
            <name name-style="western">
              <surname>Kegelmeyer</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>SMOTE: synthetic minority over-sampling technique</article-title>
          <source>J Artif Intell Res</source>
          <year>2002</year>
          <month>06</month>
          <day>01</day>
          <volume>16</volume>
          <fpage>321</fpage>
          <lpage>357</lpage>
          <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Segura-Bedmar</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Colón-Ruíz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tejedor-Alonso</surname>
              <given-names>MÁ</given-names>
            </name>
            <name name-style="western">
              <surname>Moro-Moro</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Predicting of anaphylaxis in big data EMR by exploring machine learning approaches</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>11</month>
          <volume>87</volume>
          <fpage>50</fpage>
          <lpage>59</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30187-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.012</pub-id>
          <pub-id pub-id-type="medline">30266231</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30187-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jhang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Clustering-based undersampling in class-imbalanced data</article-title>
          <source>Inf Sci</source>
          <year>2017</year>
          <month>10</month>
          <volume>409-410</volume>
          <fpage>17</fpage>
          <lpage>26</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2017.05.008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ringnér</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>What is principal component analysis?</article-title>
          <source>Nat Biotechnol</source>
          <year>2008</year>
          <month>03</month>
          <volume>26</volume>
          <issue>3</issue>
          <fpage>303</fpage>
          <lpage>304</lpage>
          <pub-id pub-id-type="doi">10.1038/nbt0308-303</pub-id>
          <pub-id pub-id-type="medline">18327243</pub-id>
          <pub-id pub-id-type="pii">nbt0308-303</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Firth</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Bias reduction of maximum likelihood estimates</article-title>
          <source>Biometrika</source>
          <year>1993</year>
          <volume>80</volume>
          <issue>1</issue>
          <fpage>27</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1093/biomet/80.1.27</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Varshavsky</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gottlieb</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Linial</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Horn</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Novel unsupervised feature filtering of biological data</article-title>
          <source>Bioinformatics</source>
          <year>2006</year>
          <month>07</month>
          <day>15</day>
          <volume>22</volume>
          <issue>14</issue>
          <fpage>e507</fpage>
          <lpage>e513</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btl214</pub-id>
          <pub-id pub-id-type="medline">16873514</pub-id>
          <pub-id pub-id-type="pii">22/14/e507</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Anowar</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sadaoui</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Selim</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Conceptual and empirical comparison of dimensionality reduction algorithms (PCA, KPCA, LDA, MDS, SVD, LLE, ISOMAP, LE, ICA, t-SNE)</article-title>
          <source>Comput Sci Rev</source>
          <year>2021</year>
          <month>05</month>
          <volume>40</volume>
          <fpage>100</fpage>
          <lpage>110</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.cosrev.2021.100378"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.cosrev.2021.100378</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Banerjee</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pal</surname>
              <given-names>NR</given-names>
            </name>
          </person-group>
          <article-title>Feature selection with SVD entropy: some modification and extension</article-title>
          <source>Inf Sci</source>
          <year>2014</year>
          <month>04</month>
          <volume>264</volume>
          <fpage>118</fpage>
          <lpage>134</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2013.12.029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kalankesh</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Weatherall</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ba-Dhfari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Buchan</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Brass</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Taming EHR data: using semantic similarity to reduce dimensionality</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2013</year>
          <volume>192</volume>
          <issue>1-2</issue>
          <fpage>52</fpage>
          <lpage>56</lpage>
          <pub-id pub-id-type="doi">10.3233/978-1-61499-289-9-52</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Investigation of heart lipid changes in acute β-AR activation-induced sudden cardiac death by time-of-flight secondary ion mass spectrometry</article-title>
          <source>Analyst</source>
          <year>2020</year>
          <month>08</month>
          <day>24</day>
          <volume>145</volume>
          <issue>17</issue>
          <fpage>5889</fpage>
          <lpage>5896</lpage>
          <pub-id pub-id-type="doi">10.1039/d0an00768d</pub-id>
          <pub-id pub-id-type="medline">32662451</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Application of principal component analysis and logistic regression model in lupus nephritis patients with clinical hypothyroidism</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2020</year>
          <month>05</month>
          <day>01</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>99</fpage>
          <lpage>110</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-020-00989-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-020-00989-x</pub-id>
          <pub-id pub-id-type="medline">32357838</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-020-00989-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC7195728</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <source>Emergency Data Processing</source>
          <year>2022</year>
          <month>09</month>
          <day>20</day>
          <access-date>2022-09-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/jacksoncoki/Emergency-data-processing">https://github.com/jacksoncoki/Emergency-data-processing</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>Validation of prediction models</article-title>
          <source>Clinical Prediction Models</source>
          <year>2019</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>329</fpage>
          <lpage>344</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Geeven</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>van Kesteren</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Smit</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>de Gunst</surname>
              <given-names>MCM</given-names>
            </name>
          </person-group>
          <article-title>Identification of context-specific gene regulatory networks with GEMULA--gene expression modeling using LAsso</article-title>
          <source>Bioinformatics</source>
          <year>2012</year>
          <month>01</month>
          <day>15</day>
          <volume>28</volume>
          <issue>2</issue>
          <fpage>214</fpage>
          <lpage>221</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btr641</pub-id>
          <pub-id pub-id-type="medline">22106333</pub-id>
          <pub-id pub-id-type="pii">btr641</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <source>Random Forests--Random Features: Technical Report 567</source>
          <year>1999</year>
          <month>09</month>
          <access-date>2022-12-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.stat.berkeley.edu/~breiman/random-forests.pdf">https://www.stat.berkeley.edu/~breiman/random-forests.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ridgeway</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Generalized boosted models: a guide to the gbm package</article-title>
          <source>Update</source>
          <year>2007</year>
          <month>08</month>
          <day>03</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>2007</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jakkula</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Tutorial on Support Vector Machine</article-title>
          <source>Tutorial on Support Vector Machine (SVM)</source>
          <year>2006</year>
          <fpage>100</fpage>
          <lpage>111</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://course.ccs.neu.edu/cs5100f11/resources/jakkula.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/springerreference_106815</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Regression shrinkage and selection via the lasso</article-title>
          <source>J R Stat Soc: B (Methodol)</source>
          <year>1996</year>
          <volume>58</volume>
          <issue>1</issue>
          <fpage>267</fpage>
          <lpage>288</lpage>
          <pub-id pub-id-type="doi">10.1111/j.2517-6161.1996.tb02080.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wongvibulsin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Daya</surname>
              <given-names>NR</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Matsushita</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Coresh</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zeger</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>Machine learning for sudden cardiac death prediction in the atherosclerosis risk in communities study</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online January 16, 2022</comment>
          <pub-id pub-id-type="doi">10.1101/2022.01.12.22269174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>LJ</given-names>
            </name>
          </person-group>
          <article-title>Principal component analysis</article-title>
          <source>WIREs Comp Stat</source>
          <year>2010</year>
          <month>06</month>
          <day>30</day>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>433</fpage>
          <lpage>459</lpage>
          <pub-id pub-id-type="doi">10.1002/wics.101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Villagrana-Bañuelos</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Galván-Tejada</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Galván-Tejada</surname>
              <given-names>JI</given-names>
            </name>
            <name name-style="western">
              <surname>Gamboa-Rosales</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Celaya-Padilla</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Soto-Murillo</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Solís-Robles</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Machine learning model based on lipidomic profile information to predict sudden infant death syndrome</article-title>
          <source>Healthcare (Basel)</source>
          <year>2022</year>
          <month>07</month>
          <day>14</day>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>1303</fpage>
          <lpage>1318</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare10071303"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare10071303</pub-id>
          <pub-id pub-id-type="medline">35885829</pub-id>
          <pub-id pub-id-type="pii">healthcare10071303</pub-id>
          <pub-id pub-id-type="pmcid">PMC9317003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhattacharya</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kudchadkar</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Greenland</surname>
              <given-names>GV</given-names>
            </name>
            <name name-style="western">
              <surname>Lingamaneni</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Corona-Villalobos</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Marine</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Olgin</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Abraham</surname>
              <given-names>TP</given-names>
            </name>
            <name name-style="western">
              <surname>Shatkay</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Abraham</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Identifying ventricular arrhythmias and their predictors by applying machine learning methods to electronic health records in patients with hypertrophic cardiomyopathy (HCM-VAr-risk model)</article-title>
          <source>Am J Cardiol</source>
          <year>2019</year>
          <month>05</month>
          <day>15</day>
          <volume>123</volume>
          <issue>10</issue>
          <fpage>1681</fpage>
          <lpage>1689</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0002-9149(19)30227-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.amjcard.2019.02.022</pub-id>
          <pub-id pub-id-type="medline">30952382</pub-id>
          <pub-id pub-id-type="pii">S0002-9149(19)30227-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alba</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Agoritsas</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hanna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Iorio</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Devereaux</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>McGinn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guyatt</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Discrimination and calibration of clinical prediction models: users' guides to the medical literature</article-title>
          <source>JAMA</source>
          <year>2017</year>
          <month>10</month>
          <day>10</day>
          <volume>318</volume>
          <issue>14</issue>
          <fpage>1377</fpage>
          <lpage>1384</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2017.12126</pub-id>
          <pub-id pub-id-type="medline">29049590</pub-id>
          <pub-id pub-id-type="pii">2656816</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>KGM</given-names>
            </name>
          </person-group>
          <article-title>Transparent reporting of a multivariable prediction model for individual prognosis or diagnosis (TRIPOD): the TRIPOD statement</article-title>
          <source>Br J Surg</source>
          <year>2015</year>
          <month>02</month>
          <volume>102</volume>
          <issue>3</issue>
          <fpage>148</fpage>
          <lpage>158</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://core.ac.uk/reader/81583807?utm_source=linkout"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/bjs.9736</pub-id>
          <pub-id pub-id-type="medline">25627261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robertson</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>FT</given-names>
            </name>
          </person-group>
          <article-title>Consistency in generalized isotonic regression</article-title>
          <source>Ann Stat</source>
          <year>1975</year>
          <month>3</month>
          <day>1</day>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>350</fpage>
          <lpage>362</lpage>
          <pub-id pub-id-type="doi">10.1214/aos/1176343061</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
