<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">50117</article-id><article-id pub-id-type="doi">10.2196/50117</article-id><title-group><article-title>Machine Learning Models for Parkinson Disease: Systematic Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Tabashum</surname><given-names>Thasina</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Snyder</surname><given-names>Robert Cooper</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Brien</surname><given-names>Megan K</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Albert</surname><given-names>Mark V</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science and Engineering, University of North Texas</institution>, 
<addr-line>Denton</addr-line><addr-line>TX</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>Technology and Innovation Hub, Shirley Ryan AbilityLab</institution>, <addr-line>Chicago</addr-line><addr-line>IL</addr-line>, <country>United States</country></aff><aff id="aff3"><institution>Department of Physical Medicine &#x0026; Rehabilitation, Northwestern University</institution>, <addr-line>Chicago</addr-line><addr-line>IL</addr-line>, <country>United States</country></aff><aff id="aff4"><institution>Department of Biomedical Engineering, University of North Texas</institution>, <addr-line>Denton</addr-line><addr-line>TX</addr-line>, <country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wong</surname><given-names>Jenna</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Marceglia</surname><given-names>Sara</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kanike</surname><given-names>Uday</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Thasina Tabashum, MSc<email>thasinatabashumabonti@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>17</day><month>5</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e50117</elocation-id><history><date date-type="received"><day>19</day><month>06</month><year>2023</year></date><date date-type="rev-recd"><day>12</day><month>02</month><year>2024</year></date><date date-type="accepted"><day>01</day><month>04</month><year>2024</year></date></history><copyright-statement>&#x00A9; Thasina Tabashum, Robert Cooper Snyder, Megan K O'Brien, Mark V Albert. 
Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 17.5.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e50117"/><abstract><sec><title>Background</title><p>With the increasing availability of data, computing resources, and easier-to-use software libraries, machine learning (ML) is increasingly used in disease detection and prediction, including for Parkinson disease (PD). Despite the large number of studies published every year, very few ML systems have been adopted for real-world use. In particular, a lack of external validity may result in poor performance of these systems in clinical practice. 
Additional methodological issues in ML design and reporting can also hinder clinical adoption, even for applications that would benefit from such data-driven systems.</p></sec><sec><title>Objective</title><p>To sample the current ML practices in PD applications, we conducted a systematic review of studies published in 2020 and 2021 that used ML models to diagnose PD or track PD progression.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a systematic literature review in accordance with PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines in PubMed between January 2020 and April 2021, using the following exact string: &#x201C;Parkinson&#x2019;s&#x201D; AND (&#x201C;ML&#x201D; OR &#x201C;prediction&#x201D; OR &#x201C;classification&#x201D; OR &#x201C;detection&#x201D; or &#x201C;artificial intelligence&#x201D; OR &#x201C;AI&#x201D;). The search resulted in 1085 publications. After a search query and review, we found 113 publications that used ML for the classification or regression-based prediction of PD or PD-related symptoms.</p></sec><sec sec-type="results"><title>Results</title><p>Only 65.5% (74/113) of studies used a holdout test set to avoid potentially inflated accuracies, and approximately half (25/46, 54%) of the studies without a holdout test set did not state this as a potential concern. Surprisingly, 38.9% (44/113) of studies did not report on how or if models were tuned, and an additional 27.4% (31/113) used ad hoc model tuning, which is generally frowned upon in ML model optimization. 
Only 15% (17/113) of studies performed direct comparisons of results with other models, severely limiting the interpretation of results.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This review highlights the notable limitations of current ML systems and techniques that may contribute to a gap between reported performance in research and the real-life applicability of ML models aiming to detect and predict diseases such as PD.</p></sec></abstract><kwd-group><kwd>Parkinson disease</kwd><kwd>machine learning</kwd><kwd>systematic review</kwd><kwd>deep learning</kwd><kwd>clinical adoption</kwd><kwd>validation techniques</kwd><kwd>PRISMA</kwd><kwd>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Parkinson disease (PD) is a progressive neurodegenerative disease that results in a loss of motor function with muscle weakness, tremors, and rigidity. Secondary symptoms include speech difficulties, sleep disorders, and cognitive changes. Research suggests that pathophysiological symptoms can be used to detect PD before the onset of the motor features [<xref ref-type="bibr" rid="ref1">1</xref>]. For these reasons, multiple clinical assessments and analyses are required to diagnose PD and allow for early detection. However, clinical diagnosis of PD is an error-prone process [<xref ref-type="bibr" rid="ref2">2</xref>]. A UK autopsy study found that the misdiagnosis rate of PD is 24% [<xref ref-type="bibr" rid="ref3">3</xref>]. Early detection is especially important for PD since early neuroprotective treatment slows down the progression of the disease and lessens the symptoms, which improves the patient&#x2019;s quality of life [<xref ref-type="bibr" rid="ref4">4</xref>]. From diagnosis to treatment, each case of PD is unique [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Precision medicine using machine learning (ML) has the potential to better use the varied data of individuals. Therefore, ML-based solutions can play an important role in PD diagnosis [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Here, ML refers to the branch of artificial intelligence that uses computational methods to perform a specific task without being explicitly programmed, by learning from previous examples of data and making predictions about new data [<xref ref-type="bibr" rid="ref9">9</xref>]. ML includes a broad range of standard learning algorithms, such as decision trees, support vector machines, and linear or logistic regression, as well as the subfield of deep learning that uses sophisticated, biologically inspired learning algorithms called neural networks. Generally, supervised algorithms learn from labeled data (eg, classification or regression), whereas unsupervised algorithms learn from hidden patterns in the unlabeled data (eg, clustering).</p><p>In the medical field, ML is becoming an increasingly central technique. For example, ML-based prediction models are being developed to detect early signs of diseases, improve decision-making processes, and track rehabilitation efficacy. Fueled by advances in data-recording technology, the increasing availability of patient data, and more accessible databases and code libraries, these models can generate more accurate insights about patients from large, existing health data sets. Contreras and Vehi [<xref ref-type="bibr" rid="ref10">10</xref>] showed that within a decade, the number of articles proposing artificial intelligence models in diabetes research grew by 500%. Despite the large number of promising studies reported in the literature, the adoption of ML models in real-life clinical practice is low [<xref ref-type="bibr" rid="ref11">11</xref>]. 
A wide range of ML models have been proposed for the automatic detection of PD [<xref ref-type="bibr" rid="ref12">12</xref>]. Searching with only 1 query related to ML and PD results in over 1000 publications in 1 year alone. Despite the rising popularity of ML in PD research, models are rarely deployed in the field due to their irreproducibility and are limited to research purposes [<xref ref-type="bibr" rid="ref13">13</xref>]. Although there may be many explanations, one possibility is a disconnect between the models developed in research and real-life implementation.</p><p>In contrast to previous systematic reviews that primarily explored data types and model variations, the emphasis of this review lies in the critical context of model validation approaches to provide a comprehensive understanding of the strengths and limitations of ML models in the PD field. Previous reviews emphasized data types; for instance, Ramdhani et al [<xref ref-type="bibr" rid="ref14">14</xref>] reviewed sensor-based ML algorithms for PD predictions, and Mei et al [<xref ref-type="bibr" rid="ref15">15</xref>] provided a comprehensive overview of outcomes associated with the type and source of data for 209 studies that applied ML models for PD diagnosis. Mei et al [<xref ref-type="bibr" rid="ref15">15</xref>] also noted concerns about insufficient descriptions of methods, results, and validation techniques. We focused on the critical evaluation of validation techniques that are instrumental for the clinical integration of ML.</p><p>In this review, we examined a cross-section of recent ML prediction models related to PD detection and progression. Our goal was to summarize the different ML practices in PD research and identify areas for improvement related to model design, training, validation, tuning, and evaluation. 
Implementing best ML practices would help researchers develop PD prediction models that are more reproducible and generalizable, which in turn would improve their impact on the entire landscape of patient care and outcomes.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Search Strategy</title><p>We conducted a systematic literature review in accordance with PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses; <xref ref-type="supplementary-material" rid="app4">Checklist 1</xref>) guidelines in PubMed between January 2020 and April 2021, using the following exact string: &#x201C;Parkinson&#x2019;s&#x201D; AND (&#x201C;ML&#x201D; OR &#x201C;prediction&#x201D; OR &#x201C;classification&#x201D; OR &#x201C;detection&#x201D; or &#x201C;artificial intelligence&#x201D; OR &#x201C;AI&#x201D;). The search resulted in 1085 publications.</p></sec><sec id="s2-2"><title>Inclusion and Exclusion</title><p>Inclusion criteria were studies (1) on ML applied for predicting PD, PD subscores or PD severity, and PD symptoms; (2) published between January 2020 and April 2021; (3) written in English; and (4) with an available title and abstract.</p></sec><sec id="s2-3"><title>Questionnaire Design</title><p>We designed a customized questionnaire to easily parse the literature and extract characteristics of the different ML approaches. <xref ref-type="other" rid="box1">Textbox 1</xref> summarizes the model details extracted from the questionnaire, and the exact questionnaire is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. This questionnaire was not intended to extract exhaustive details about these models, but rather to target specific concepts that seem to be inconsistently reported in the PD modeling literature. 
Our rationale for each question, and how they were designed specifically for PD, is provided below.</p><p>PD is a progressive neurological disorder, and symptoms can vary widely for each individual. To categorize PD progression and assess patient status, clinicians use standardized metrics such as the Unified Parkinson&#x2019;s Disease Rating Scale [<xref ref-type="bibr" rid="ref16">16</xref>] and Hoehn and Yahr (H&#x0026;Y) scores [<xref ref-type="bibr" rid="ref17">17</xref>]. The first question is related to clearly defining the research objectives or target outcomes of a particular study. The challenge of classifying PD versus non-PD may depend on symptom severity, which can be more readily assessed when severity metrics are available. In certain stages of PD, symptoms can be controlled or lessened through careful medication regimens, such as levodopa. This medication&#x2019;s <italic>on</italic> and <italic>off</italic> periods are essential components for clinicians and researchers to consider. <italic>On</italic> and <italic>off</italic> episodes can create a substantially different effect on symptoms [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], and these symptoms are being used in ML algorithms to classify or assess PD. For example, Jahanshahi et al [<xref ref-type="bibr" rid="ref20">20</xref>] investigated the levodopa medication&#x2019;s effect on PD probabilistic classification learning and demonstrated that learning is associated with the patient with PD being in an <italic>on</italic> or <italic>off</italic> state. Warmerdam et al [<xref ref-type="bibr" rid="ref21">21</xref>] showed that the patient&#x2019;s state relative to dopaminergic medication correlated with the arm-swing task during PD walking. PD characteristics are important while researching PD, and the application of the models might play different roles depending on the data. 
As a result, the questions regarding the severity and medication state of patients can play a crucial role. In addition, class imbalance, cross-validation techniques, and hyperparameter tuning are critical concepts in ML. Class imbalance can lead to biased models or misinterpretation of results. Cross-validation and hyperparameter tuning allow systematic exploring of models and are essential for assessing models&#x2019; generalization performance. Lastly, comparing model performance to benchmark data can be valuable for research goals, but this process is not always applicable or possible.</p><boxed-text id="box1"><title> Model details obtained during data extraction (n=113).</title><list list-type="order"><list-item><p>What have the authors classified using machine learning?</p></list-item><list-item><p>Was there any information about the participants being on or off medication prior to the experiment?</p></list-item><list-item><p>Of the study participants, how many were (1) individuals with Parkinson disease, (2) controls, and (3) individuals with other diseases?</p></list-item><list-item><p>Did the study mention the distribution of the Unified Parkinson&#x2019;s Disease Rating Scale and Hoehn and Yahr scores?</p></list-item><list-item><p>What class imbalance mitigation techniques did the authors perform?</p></list-item><list-item><p>How did the authors split or cross-validate the data set while training the model? If cross-validation was applied, which particular strategies were applied?</p></list-item><list-item><p>If applicable, have the authors made the reader aware of the potential overinflated performance results (eg, the model overfitting the training data)? 
If so, how?</p></list-item><list-item><p>How was the hyperparameter tuning done?</p></list-item><list-item><p>Did the authors analyze and discuss the models&#x2019; errors or misclassifications?</p></list-item><list-item><p>How did they compare their model to other modeling approaches by themselves or other authors, directly or indirectly?</p></list-item><list-item><p>Did the authors use multiple evaluation metrics to measure the performance of the model(s)?</p></list-item></list></boxed-text></sec><sec id="s2-4"><title>Data Extraction</title><p>Two authors assessed the inclusion criteria of 1085 studies based on the title and abstract. During the initial manual screening of the title and abstract, 155 studies that met the initial inclusion criteria were identified. A total of 42 studies were excluded after assessing the full text for eligibility. These authors also extracted data from the studies using the questionnaire described above. Ultimately, 113 studies and the corresponding questionnaire responses were rechecked independently by both reviewers, and disagreements were resolved through discussion to reach a consensus. Questionnaire data from each study are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>For the multiple-choice and checkbox questions (ie, questions 1, 7, 8, 9, 10, 11, 13, 14, and 15), we counted the number of times each response occurred in the results.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>First, we provide a general overview of the study characteristics in each publication. 
Then, we examine specific results evaluating the ML modeling practices using the following categories: PD characteristics, class imbalance, data set splitting, overfitting, hyperparameter tuning, and model comparisons.</p><sec id="s3-1"><title>General Overview of Studies</title><sec id="s3-1-1"><title>Methods Applied</title><p>The most prevalent ML classification algorithms were support vector machines (53/113, 46.9%), boosting ensemble learning (48/113, 42.5%; eg, gradient boosting, extreme gradient boosting, and random forest), naive Bayes (4/113, 3.5%), decision tree (13/113, 11.5%), and <italic>k</italic>-nearest neighbor (22/113, 19.5%). In regression models, the most prevalent methods included multiple linear or logistic regression (32/113, 28.3%), regression trees, <italic>k</italic>-means clustering, and Bayesian regression (3/113, 2.6%). Deep learning methods included convolutional neural networks (10/113, 8.8%), variants of recurrent neural networks (4/113, 3.5%; eg, long short-term memory [LSTM] and bidirectional-LSTM), and fully connected neural networks (22/113, 19.5%).</p></sec><sec id="s3-1-2"><title>Data Modalities and Sources</title><p>More than half of the studies (65/113, 57.5%) used data collected by the authors, whereas 38.9% (44/113) used a public data set and 3.5% (4/113) used a mixture of public and private data sets. 
The most common data modalities were magnetic resonance imaging, single-photon emission computerized tomography imaging, voice recordings or features, gait movements, handwriting movements, surveys, and cerebrospinal fluid features.</p></sec></sec><sec id="s3-2"><title>ML Modeling Practices</title><sec id="s3-2-1"><title>PD Prediction Target</title><p>We categorized the studies based on 5 ML outcomes for PD models: <italic>PD versus non-PD classification</italic>, <italic>PD severity prediction</italic>, <italic>PD versus non-PD versus other diseases classification</italic>, <italic>PD symptoms quantification</italic>, and <italic>PD progression prediction</italic>. A total of 10 studies fell into more than 1 category; among them, 8 (80%) studies examined both <italic>PD versus non-PD classification</italic> and <italic>PD severity regression</italic>, and 2 (20%) studies examined <italic>PD versus non-PD classification</italic> and <italic>PD symptoms quantification</italic>.</p><list list-type="order"><list-item><p><italic>PD versus non-PD classification</italic> (59/113, 52.2%): studies that proposed ML methods to distinguish between individuals with PD from controls without PD</p></list-item><list-item><p><italic>PD severity prediction</italic> (30/113, 26.5%): studies that proposed ML methods to predict the stages of Unified Parkinson&#x2019;s Disease Rating Scale scores or H&#x0026;Y scores of PD</p></list-item><list-item><p><italic>PD versus non-PD versus other diseases classification</italic> (24/113, 21.2%): studies that proposed ML methods to distinguish between PD, non-PD, and other diseases (eg, Alzheimer disease)</p></list-item><list-item><p><italic>PD symptoms quantification</italic> (9/113, 8%): studies that proposed ML methods to distinguish between PD symptoms (eg, tremor and bradykinesia) from no symptoms or non-PD symptoms</p></list-item><list-item><p><italic>PD progression prediction</italic> (1/113, 0.9%): studies that proposed ML methods to 
predict PD progression</p></list-item></list><p><italic>PD versus non-PD classification</italic> and <italic>PD versus non-PD versus other diseases classification</italic> have target settings that are binary variable predictions, as these targets are mostly for predicting the presence or absence of PD. <italic>PD severity prediction</italic> can be categorical (multilabel classification) or continuous (regression), such as predicting the H&#x0026;Y score. <italic>PD symptoms quantification</italic> can also be categorical, such as predicting the presence of resting tremors, rigidity, and bradykinesia, or continuous, such as predicting the degree of tremor intensity. <italic>PD progression prediction</italic> measures the changes in overall disease severity at multiple time points. We found that most studies (107/113, 94.6%) indicated PD severity. However, fewer than half (53/113, 46.9%) of the studies reported the patient medication status directly, with 38.9% (44/113) using public data sets.</p></sec><sec id="s3-2-2"><title>Class Imbalance</title><p>Class imbalance occurs when 1 training class contains significantly fewer samples than another class. In this case, the learners tend to focus on the better performance of the majority group, making it difficult to interpret the evaluation metrics, such as accuracy, for groups with less representation. Prediction models can be significantly affected by the imbalance problem. ML models can be highly unstable with different imbalance ratios [<xref ref-type="bibr" rid="ref22">22</xref>]. On predicting <italic>PD versus non-PD classification</italic>, performance can suffer significantly from an imbalanced data set and generate impaired results [<xref ref-type="bibr" rid="ref23">23</xref>]. 
Class imbalance can impact model external validity, and either mitigating or at least reporting the potential concerns in the interpretability of outcomes due to imbalances would help the reader interpret the model&#x2019;s power for predicting each class.</p><p>There are multiple ways to handle a class imbalance in the training phase, such as using resampling techniques or weighted evaluation metrics. Resampling creates a more balanced training data set, such as by oversampling the minority class or undersampling the majority class [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Moreover, there are alternative evaluation metrics, for example, balanced accuracy and <italic>F</italic>-measure, but these improvements on the standard evaluation metrics are also affected by class imbalance [<xref ref-type="bibr" rid="ref26">26</xref>]. We observed that among the studies that attempted to mitigate class imbalance, many of them adopted under- or oversampling methods and then applied class weights to the evaluation metrics. Other techniques were data augmentation and grouping data to use the same ratio of minority and majority classes. In the case of extreme class imbalance, Megahed et al [<xref ref-type="bibr" rid="ref27">27</xref>] were not able to mitigate overfitting. Overall, there is no perfect solution to tackle this critical issue in ML; however, recognizing that the problem exists and investigating appropriate mitigation strategies should be standard practice. 
Our results found at least moderate class imbalance in more than two-thirds (77/113, 68.1%) of the studies, and only 18% (5/27), 31% (5/16), 27% (8/30), and 25% (1/4) of studies for the <italic>PD versus non-PD classification</italic>, <italic>PD versus non-PD versus other diseases classification</italic>, <italic>PD severity prediction</italic>, and <italic>PD symptoms quantification and progression prediction</italic> target categories applied strategies to mitigate the effects of class imbalance, respectively. In <xref ref-type="fig" rid="figure1">Figure 1</xref>, we illustrate the number of studies with more than 30% class imbalance and how many of them applied imbalance mitigation strategies.</p><p>In some cases, authors applied class imbalance strategies but found no significant improvement in their model performance. Reporting these cases still provides valuable perspectives. For instance, van den Goorbergh et al [<xref ref-type="bibr" rid="ref28">28</xref>] illustrated that correcting for imbalance resulted in the model exhibiting strong miscalibration and did not improve the model&#x2019;s capability to distinguish between patients and controls. A total of 4 studies compared results when using imbalanced data compared to imbalance-mitigated data. Details of these studies are provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Number of studies with more than 30% class imbalance and the percentage of studies that applied the class imbalance strategies, separated by PD prediction target. 
In the <italic>PD versus non-PD classification</italic>, <italic>PD versus non-PD versus other diseases classification</italic>, <italic>PD severity prediction</italic>, and <italic>PD symptoms quantification and progression prediction</italic> categories, 46% (27/59), 67% (16/24), 100% (30/30), and 40% (4/10) had class imbalance, but only 8% (5/59), 21% (5/24), 27% (8/30), and 10% (1/10) applied mitigation strategies, respectively. PD: Parkinson disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e50117_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison between imbalanced data versus imbalance mitigation strategies.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Studies</td><td align="left" valign="bottom">Participant distribution</td><td align="left" valign="bottom">Techniques</td><td align="left" valign="bottom">Conclusion</td></tr></thead><tbody><tr><td align="left" valign="top">Moon et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">524 patients with PD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> and 43 patients with essential tremor</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>SMOTE<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p><italic>F</italic><sub>1</sub>-score improved</p></list-item></list></td></tr><tr><td align="left" valign="top">Veeraragavan et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">93 patients with idiopathic PD and 73 controls; 10 patients with H&#x0026;Y<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> 3; 28 patients with H&#x0026;Y 2.5; and 55 patients with H&#x0026;Y 2</td><td align="left" valign="top"><list 
list-type="bullet"><list-item><p>SMOTE</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Test accuracy improved</p></list-item></list></td></tr><tr><td align="left" valign="top">Falchetti et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">388 patients with idiopathic PD and 323 controls</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Oversampling</p></list-item><list-item><p>Undersampling</p></list-item><list-item><p>Combination of oversampling and undersampling</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Without any sampling, the combination of oversampling and undersampling methods is comparable</p></list-item></list></td></tr><tr><td align="left" valign="top">Jeancolas et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">115 patients with PD and 152 controls</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Data augmentation</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Performed better for free speech task</p></list-item><list-item><p>No consistent improvement in the sentence repetition task</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>PD: Parkinson disease.</p></fn><fn id="table1fn2"><p><sup>b</sup>SMOTE: synthetic minority oversampling technique.</p></fn><fn id="table1fn3"><p><sup>c</sup>H&#x0026;Y: Hoehn and Yahr.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2-3"><title>Data Set Splitting</title><p>It is universally acknowledged that ML models can perform arbitrarily well on data that were used to create the model&#x2014;that is, the training data set. This is why standard procedure in training models uses separate data sets to try different model variations and select the better variants. 
The confusion that sometimes occurs is when these separate data sets are used to select from a large number of model variants (validation set) or only used for the evaluation of selected variants (test set). The distinction in these 2 use cases of separate data is sometimes not clear and depends on the number of model variants tested. Critically, with modern ML practice, many model variants are often tested on provided data, which readily leads to overfitting on both the original training data and validation set used for evaluation. A separate holdout test set would be needed to properly evaluate model performance [<xref ref-type="bibr" rid="ref33">33</xref>]. A single split can be error prone in estimating performance [<xref ref-type="bibr" rid="ref34">34</xref>]. It is critical to have a holdout test set to provide better performance estimation. Additionally, cross-validation is a technique largely used to estimate and compare model performance or to optimize the hyperparameters [<xref ref-type="bibr" rid="ref35">35</xref>]. Cross-validation divides the data into folds and iterates on these folds to test and train the models using different partitions of the data set. We found that 78.8% (89/113) of the studies used cross-validation; however, 5.3% (6/113) of the studies either did not mention the details of the validation procedure or did not do any splitting. A total of 9.7% (11/113) of the studies split the data set into only 2 sets, but it was not clear if the separate set was a validation set or a test set. 
Only 19.5% (22/113) of the studies applied cross-validation without a holdout test set (<xref ref-type="table" rid="table2">Table 2</xref> and Figure S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Distribution of studies according to data set splitting techniques.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Data set splitting techniques</td><td align="left" valign="bottom">Studies (n=113), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Not mentioned</td><td align="left" valign="top">6 (5.3)</td></tr><tr><td align="left" valign="top">Split into 2 sets (training, test, or validation sets)</td><td align="left" valign="top">11 (9.7)</td></tr><tr><td align="left" valign="top">Only cross-validation</td><td align="left" valign="top">22 (19.5)</td></tr><tr><td align="left" valign="top">Split into 3 sets</td><td align="left" valign="top">7 (6.2)</td></tr><tr><td align="left" valign="top">Cross-validation and holdout test set</td><td align="left" valign="top">67 (59.3)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2-4"><title>Cross-Validation</title><p>There are multiple types of cross-validation techniques. In <italic>k</italic>-fold cross-validation, the data set is divided into <italic>k</italic> equal folds randomly, and the model is trained and evaluated <italic>k</italic> times. Each time, the model is trained using <italic>k</italic>&#x2013;1 folds and evaluated in the remaining fold. When the observations are independent and identically distributed, <italic>k</italic>-fold cross-validation works well. When the data are not identically distributed, <italic>k</italic>-fold cross-validation makes the model prone to overfitting and not generalize well [<xref ref-type="bibr" rid="ref36">36</xref>]. 
For instance, multiple data samples from the same patient should generally not be present in both training and testing data sets. Subject-wise cross-validation separates folds according to the subject. Although Saeb et al [<xref ref-type="bibr" rid="ref37">37</xref>] concluded that subject-wise methods are more clinically relevant compared to record-wise methods, Little et al [<xref ref-type="bibr" rid="ref38">38</xref>] argued that subject-wise methods might not be the best in all use cases. However, Westerhuis et al [<xref ref-type="bibr" rid="ref39">39</xref>] demonstrated that cross-validation can be overoptimistic and suggested that it is good practice to include a separate test set at the end to properly evaluate a model. To reduce bias in model evaluation, nested cross-validation is another technique that involves 2 cross-validation loops [<xref ref-type="bibr" rid="ref40">40</xref>]. The outer loop generates <italic>k</italic>-folds and iterates through them, so each fold is eventually used as a holdout test fold for a model developed using the remaining data. The inner loop uses a similar <italic>k</italic>-fold procedure to create a holdout validation fold that is used to select the best model during model tuning. Nested cross-validation is a more robust way to evaluate models than <italic>k</italic>-fold cross-validation alone, since using all available data to select the model architecture can lead to biased, overfitted results [<xref ref-type="bibr" rid="ref40">40</xref>]. However, nested cross-validation is more computationally intensive, and these models can be difficult to interpret or implement (since they actually result in <italic>k</italic>-best models, so performance is usually averaged over all <italic>k</italic>-best models). 
In our analysis, we found that the most common cross-validation technique is <italic>k</italic>-fold cross-validation (68/113, 60.2%), whereas only 4.4% (5/113) of the studies adopted nested cross-validation (<xref ref-type="table" rid="table3">Table 3</xref> and Figure S2 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Of the 113 studies, 20 (17.7%) adopted 2 types of cross-validation techniques, and 5 (4.4%) adopted 3 types of techniques.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Distribution of studies that adopted cross-validation techniques.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cross-validation techniques</td><td align="left" valign="bottom">Studies (n=113), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top"><italic>k</italic>-fold cross-validation</td><td align="left" valign="top">68 (60.2)</td></tr><tr><td align="left" valign="top">Leave-p-out cross-validation</td><td align="left" valign="top">25 (22.1)</td></tr><tr><td align="left" valign="top">Stratified or subject-wise cross-validation</td><td align="left" valign="top">21 (18.6)</td></tr><tr><td align="left" valign="top">Nested cross-validation</td><td align="left" valign="top">5 (4.4)</td></tr><tr><td align="left" valign="top">No cross-validation</td><td align="left" valign="top">24 (21.2)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2-5"><title>Overfitting</title><p>We selected publications that did not evaluate their models with a holdout test set and then we analyzed if they mentioned that the proposed models could possibly be overfitting. Models can be overfitted for multiple reasons, such as an imbalanced data set or the lack of proper model selection and validation technique. Even with cross-validation, if a separate holdout set is not used, then the results can be inflated. 
Rao et al [<xref ref-type="bibr" rid="ref41">41</xref>] demonstrated that leave-one-out cross-validation can achieve 100% sensitivity, but performance on a holdout test set can be significantly lower. Cross-validation alone is not sufficient model validation when the dimensionality of the data is high [<xref ref-type="bibr" rid="ref41">41</xref>]. However, there are multiple ways to address or prevent overfitting, such as the examples provided by Ying [<xref ref-type="bibr" rid="ref42">42</xref>]. Making the reader aware of overfitting concerns in the interpretability of results should be standard practice. Therefore, we searched to see if the authors mentioned that their model can suffer from overfitting. For this analysis, we excluded studies that applied the cross-validation technique with a holdout test set. We found that just over 54% (25/46) of the studies that likely suffer from overfitting did not mention it as a concern. Although 45% (21/46) of studies mentioned overfitting as a potential limitation, many of them did not have any detailed discussion about this.</p></sec><sec id="s3-2-6"><title>Hyperparameters</title><p>While training a model, hyperparameters are selected to define the architecture of the model. These hyperparameters are often tuned so that the model gives the best performance. A common method of finding the best hyperparameters is by defining a range of parameters to test, then applying a grid search or random search on the fixed search space, and finally selecting parameters to minimize the model error [<xref ref-type="bibr" rid="ref43">43</xref>]. These methods can be extremely computationally expensive and time-consuming depending on data complexity and available computation power [<xref ref-type="bibr" rid="ref44">44</xref>]. Regardless of the method applied, it is considered good practice to make clear statements about the tuning process of hyperparameters to improve reproducibility [<xref ref-type="bibr" rid="ref45">45</xref>]. 
This practice ensures parameters are properly selected and models are ready for direct comparison. Our results demonstrated that 38.9% (44/113) of studies did not report on hyperparameter tuning (<xref ref-type="table" rid="table4">Table 4</xref> and Figure S3 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Of these, 2 adopted least absolute shrinkage and selection operator logistic regression, and 3 used a variant of logistic regression or linear regression, which typically have few or no hyperparameters to adjust.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Distribution of studies according to hyperparameter tuning methods.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Hyperparameter tuning methods</td><td align="left" valign="bottom">Studies (n=113), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Not reported</td><td align="left" valign="top">44 (38.9)</td></tr><tr><td align="left" valign="top">Ad hoc</td><td align="left" valign="top">31 (27.4)</td></tr><tr><td align="left" valign="top">Random search</td><td align="left" valign="top">1 (0.9)</td></tr><tr><td align="left" valign="top">Grid search</td><td align="left" valign="top">27 (23.9)</td></tr><tr><td align="left" valign="top">Others</td><td align="left" valign="top">10 (8.8)</td></tr></tbody></table></table-wrap><p>For many other models, there are inherently only a few hyperparameters that are usually adjusted; for instance, the major hyperparameter for the neighbor model is the number of neighbors, <italic>k</italic>. On the other hand, more complex models such as convolutional neural networks and LSTM require thorough tuning to achieve meaningful performance. Regardless of the number of hyperparameters in a model, proper tuning would likely still contribute to achieving optimal performance. 
The choice of hyperparameters will impact model generalization, so it is worthwhile to examine changes in performance with different settings [<xref ref-type="bibr" rid="ref46">46</xref>].</p></sec><sec id="s3-2-7"><title>Model Comparison</title><p>In research domains that require complex deep learning models to achieve state-of-the-art performance, such as computer vision and natural language processing, it has become a regular practice to compare models with numeric benchmark data sets to contextualize their proposed model and provide insight into the model&#x2019;s relative performance to peers. Although such rigorous benchmarking and comparison is not possible given the heterogeneous data sets in PD research, it is important to contextualize a model&#x2019;s performance relative to other models, strategies, and data sets. We found that 66.4% (75/113) of studies compared results from multiple alternative models in their work, and 15% (17/113) of studies compared their results with previously published models. 
However, 18.6% (21/113) of studies only reported their single model performance and made no comparison to any other models or benchmarks (<xref ref-type="table" rid="table5">Table 5</xref> and Figure S4 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Distribution of studies according to model comparison methods; 18.6% (21/113) of studies did not compare their model results to any alternative models or previously published models or benchmarks.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model comparison methods</td><td align="left" valign="bottom">Studies (n=113), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Compared with their own multiple models</td><td align="left" valign="top">75 (66.4)</td></tr><tr><td align="left" valign="top">Compared with previous models or benchmarks</td><td align="left" valign="top">4 (3.5)</td></tr><tr><td align="left" valign="top">Compared with previous models and their own multiple models</td><td align="left" valign="top">13 (11.5)</td></tr><tr><td align="left" valign="top">No comparisons</td><td align="left" valign="top">21 (18.6)</td></tr></tbody></table></table-wrap></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In summary, we have comprehensively reviewed the general practices of ML research applied to PD in a recent cross-section of publications. We have identified several important areas of improvement for model building to reduce the disparity between in-the-lab research and real-world clinical applications. 
Standardizing the model reporting techniques and implementing best ML practices would increase the acceptability and reliability of these models to improve patient evaluation and care [<xref ref-type="bibr" rid="ref47">47</xref>].</p><p>For the interoperability and usability of the models, clinicians need detailed information about the patients included in the model&#x2019;s training data, such as their medication state and PD progression stage. This information determines the predictive validity of a model to new patients and settings. We found that 94.7% (107/113) of the studies explained the PD severity of their patients, whereas only 46.9% (53/113) of studies reported the medication state of the patients. To incorporate data-driven algorithms in real life, the description of medication is significantly relevant to PD [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. The overall representation of demographic samples in the training set should be accounted for as well. Our results show that 68.1% (77/113) of the studies had a class imbalance greater than 30% difference in their data set, and less than one-third (from 5/27, 18% to 5/16, 31%) of the studies addressed imbalance as a potential issue or considered its impact on the model results.</p><p>Another major finding is the lack of a standard reporting framework for a model&#x2019;s hyperparameter search and tuning. Hyperparameter tuning has a major impact on the model configuration and, by extension, its performance [<xref ref-type="bibr" rid="ref50">50</xref>]. For example, Wong et al [<xref ref-type="bibr" rid="ref51">51</xref>] demonstrated that a model using tuned (grid-searched) hyperparameters outperformed a model using default hyperparameters. Addressing hyperparameters is also essential for reproducibility, including a report on the final model configuration and how the authors made the decision. 
Although this is a considerably important aspect of ML model reporting, our study showed that 44 (38.9%) of the 113 studies did not report the hyperparameter tuning approach. Of these, 5 studies adopted logistic regression or linear regression. Traditional regression models are not expected to undergo significant hyperparameter tuning; however, variants that involve hyperparameters would likely still benefit from tuning. Consistent reporting of hyperparameter tuning practices will enhance the robustness and reliability of these models.</p><p>Moreover, to provide context to the results of model performance, comparisons of different models or with previously published models give a general idea of the quality of the proposed models. We found that 18.6% (21/113) of the studies only reported their proposed models; on the contrary, the reporting standard of proposed models in the computer vision and natural language processing fields is extensive. For instance, Wang et al [<xref ref-type="bibr" rid="ref52">52</xref>] and Liu et al [<xref ref-type="bibr" rid="ref53">53</xref>] proposed methods for visual recognition, and they reported large-scale experiment results with different data sets and compared their results with more than 10 previously proposed methods. Similarly, in natural language processing, to propose a task such as emotion cause extraction, Xia and Ding [<xref ref-type="bibr" rid="ref54">54</xref>] compared around 8 methods with different evaluation metrics. These are a few cases to demonstrate that such comparisons are widely executed in the computer vision and natural language processing communities to propose a method. This systematic practice of comparison with previously published approaches results in reproducibility. Unfortunately, we found that only 15% (17/113) of the studies compared with previously proposed methods. 
However, in the medical field, due to the challenges of data availability, proper comparisons might not be possible.</p><p>There are several factors in ML and deep learning research that can create misleading results. One major factor is proper model validation, particularly in how the training and test data are separated. We found that 5.3% (6/113) of studies either did not provide the details about data set splitting or did not do any splitting, and 15.9% (18/113) of studies performed static training, validation, and test set separation, which provides limited stability of scores. Cross-validation is a more stable validation method conducted while training the model and reduces the risk of overfitting [<xref ref-type="bibr" rid="ref55">55</xref>]. The majority (89/113, 78.8%) of studies adopted some form of cross-validation, and the most common cross-validation technique adopted was <italic>k</italic>-fold (68/113, 60.2%). Nevertheless, the use case of different validation techniques depends on the data set and is problem specific. As powerful as cross-validation is in creating reliable models, applying simple cross-validation does not guarantee that the model is not overfitted [<xref ref-type="bibr" rid="ref41">41</xref>]. For the studies that did not evaluate their results with a holdout test set in a cross-validation manner, we extracted information from their discussion sections. To be precise, we checked if they made their reader aware of how the study results might be overfitting. We found that 46% (21/46) of the studies that are potentially reporting overfitted scores did not mention this concern. The developed models should be reported with their limitations for transparency to allow for further improvement and real-world adoption.</p><p>In this systematic review, we sampled 113 recent studies on PD to summarize the standard ML practices and addressed broader concerns on reporting strategies. 
It is challenging for authors to always implement the best practices considering the practical realities of health care data, including limited sample sizes, noisy data, medical data privacy, etc. However, whenever possible, authors should consider these reporting practices, especially to acknowledge limitations in their data, model design, and performance. This will help to determine reasonable use cases for these models or to identify areas of improvement before they are ready for clinical translation. These considerations can also extend to other health care applications of ML.</p></sec><sec id="s4-2"><title>Conclusion</title><p>Despite the increasing number of studies, our results demonstrate there are still many opportunities for improvement in reporting and implementing ML for applications in PD detection and progression. Studies should report detailed, standardized patient characteristics; use robust validation techniques to ensure the model&#x2019;s reliability; and justify choices of evaluation as well as hyperparameters. We found that 75% (58/77) of the studies sampled from 2020 to 2021 did not address class imbalance, and one-third (44/113, 38.9%) of studies did not report hyperparameter tuning. Reporting is the first step to understanding the usability and interpretation of models. By shifting the focus to the critical evaluation of these methods, we aim to improve the reporting and review of ML to strengthen the connection between research and real-world clinical applications. Ideally, the processes can be standardized, and clinical measurements can be leveraged more effectively for prediction models to improve the real-world impact on individuals with PD or other health conditions.</p></sec></sec></body><back><notes><sec><title>Data Availability</title><p>All data generated or analyzed during this study are included in this paper.</p></sec></notes><fn-group><fn fn-type="con"><p>TT, MVA, and MKO conceptualized the study. 
TT and RCS conducted the review, extracted the data, and conducted the analysis. TT wrote the paper. MKO and MVA revised the paper and supervised the study. All authors reviewed the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">H&#x0026;Y</term><def><p>Hoehn and Yahr</p></def></def-item><def-item><term id="abb2">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb3">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb4">PD</term><def><p>Parkinson disease</p></def></def-item><def-item><term id="abb5">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garrote</surname><given-names>JAD</given-names></name><name name-style="western"><surname>Cervantes</surname><given-names>CE</given-names></name><name name-style="western"><surname>D&#x00ED;az</surname><given-names>MS</given-names></name></person-group><article-title>Prediagnostic presentations of Parkinson&#x2019;s disease in primary care: a case-control study [Article in Spanish]</article-title><source>Semergen</source><year>2015</year><volume>41</volume><issue>5</issue><fpage>284</fpage><lpage>286</lpage><pub-id pub-id-type="doi">10.1016/j.semerg.2015.01.007</pub-id><pub-id pub-id-type="medline">25752864</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rizzo</surname><given-names>G</given-names></name><name name-style="western"><surname>Copetti</surname><given-names>M</given-names></name><name 
name-style="western"><surname>Arcuti</surname><given-names>S</given-names></name><name name-style="western"><surname>Martino</surname><given-names>D</given-names></name><name name-style="western"><surname>Fontana</surname><given-names>A</given-names></name><name name-style="western"><surname>Logroscino</surname><given-names>G</given-names></name></person-group><article-title>Accuracy of clinical diagnosis of Parkinson disease: a systematic review and meta-analysis</article-title><source>Neurology</source><year>2016</year><month>02</month><day>9</day><volume>86</volume><issue>6</issue><fpage>566</fpage><lpage>576</lpage><pub-id pub-id-type="doi">10.1212/WNL.0000000000002350</pub-id><pub-id pub-id-type="medline">26764028</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pagan</surname><given-names>FL</given-names></name></person-group><article-title>Improving outcomes through early diagnosis of Parkinson&#x2019;s disease</article-title><source>Am J Manag Care</source><year>2012</year><month>09</month><volume>18</volume><issue>7 Suppl</issue><fpage>S176</fpage><lpage>S182</lpage><pub-id pub-id-type="medline">23039866</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Postuma</surname><given-names>RB</given-names></name><name name-style="western"><surname>Berg</surname><given-names>D</given-names></name></person-group><article-title>Advances in markers of prodromal Parkinson disease</article-title><source>Nat Rev Neurol</source><year>2016</year><month>10</month><day>27</day><volume>12</volume><issue>11</issue><fpage>622</fpage><lpage>634</lpage><pub-id pub-id-type="doi">10.1038/nrneurol.2016.152</pub-id><pub-id pub-id-type="medline">27786242</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jankovic</surname><given-names>J</given-names></name></person-group><article-title>Parkinson&#x2019;s disease: clinical features and diagnosis</article-title><source>J Neurol Neurosurg Psychiatry</source><year>2008</year><month>04</month><volume>79</volume><issue>4</issue><fpage>368</fpage><lpage>376</lpage><pub-id pub-id-type="doi">10.1136/jnnp.2007.131045</pub-id><pub-id pub-id-type="medline">18344392</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Massano</surname><given-names>J</given-names></name><name name-style="western"><surname>Bhatia</surname><given-names>KP</given-names></name></person-group><article-title>Clinical approach to Parkinson&#x2019;s disease: features, diagnosis, and principles of management</article-title><source>Cold Spring Harb Perspect Med</source><year>2012</year><month>06</month><volume>2</volume><issue>6</issue><fpage>a008870</fpage><pub-id pub-id-type="doi">10.1101/cshperspect.a008870</pub-id><pub-id pub-id-type="medline">22675666</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names></name></person-group><article-title>Mining imaging and clinical data with machine learning approaches for the diagnosis and early detection of Parkinson&#x2019;s disease</article-title><source>NPJ Parkinsons Dis</source><year>2022</year><month>01</month><day>21</day><volume>8</volume><issue>1</issue><fpage>13</fpage><pub-id pub-id-type="doi">10.1038/s41531-021-00266-8</pub-id><pub-id pub-id-type="medline">35064123</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Miljkovic</surname><given-names>D</given-names></name><name name-style="western"><surname>Aleksovski</surname><given-names>D</given-names></name><name name-style="western"><surname>Podpe&#x010D;an</surname><given-names>V</given-names></name><name name-style="western"><surname>Lavra&#x010D;</surname><given-names>N</given-names></name><name name-style="western"><surname>Malle</surname><given-names>B</given-names></name><name name-style="western"><surname>Holzinger</surname><given-names>A</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Holzinger</surname><given-names>A</given-names></name></person-group><article-title>Machine learning and data mining methods for managing Parkinson&#x2019;s disease</article-title><source>Machine Learning for Health Informatics. Lecture Notes in Computer Science</source><year>2016</year><volume>9605</volume><publisher-name>Springer</publisher-name><fpage>209</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1007/978-3-319-50478-0_10</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Russell</surname><given-names>SJ</given-names></name><name name-style="western"><surname>Norvig</surname><given-names>P</given-names></name></person-group><source>Artificial Intelligence: A Modern Approach</source><year>2003</year><publisher-name>Prentice Hall/Pearson Education</publisher-name></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Contreras</surname><given-names>I</given-names></name><name name-style="western"><surname>Vehi</surname><given-names>J</given-names></name></person-group><article-title>Artificial intelligence for diabetes management and decision support: literature review</article-title><source>J Med 
Internet Res</source><year>2018</year><month>05</month><day>30</day><volume>20</volume><issue>5</issue><fpage>e10775</fpage><pub-id pub-id-type="doi">10.2196/10775</pub-id><pub-id pub-id-type="medline">29848472</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>JH</given-names></name><name name-style="western"><surname>Asch</surname><given-names>SM</given-names></name></person-group><article-title>Machine learning and prediction in medicine &#x2014; beyond the peak of inflated expectations</article-title><source>N Engl J Med</source><year>2017</year><month>06</month><day>29</day><volume>376</volume><issue>26</issue><fpage>2507</fpage><lpage>2509</lpage><pub-id pub-id-type="doi">10.1056/NEJMp1702071</pub-id><pub-id pub-id-type="medline">28657867</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bind</surname><given-names>S</given-names></name><name name-style="western"><surname>Tiwari</surname><given-names>AK</given-names></name><name name-style="western"><surname>Sahani</surname><given-names>AK</given-names></name><etal/></person-group><article-title>A survey of machine learning based approaches for Parkinson disease prediction</article-title><source>International Journal of Computer Science and Information Technologies</source><year>2015</year><access-date>2024-04-29</access-date><volume>6</volume><issue>2</issue><fpage>1648</fpage><lpage>1655</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.ijcsit.com/docs/Volume%206/vol6issue02/ijcsit20150602163.pdf">https://www.ijcsit.com/docs/Volume%206/vol6issue02/ijcsit20150602163.pdf</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Salari</surname><given-names>N</given-names></name><name name-style="western"><surname>Kazeminia</surname><given-names>M</given-names></name><name name-style="western"><surname>Sagha</surname><given-names>H</given-names></name><name name-style="western"><surname>Daneshkhah</surname><given-names>A</given-names></name><name name-style="western"><surname>Ahmadi</surname><given-names>A</given-names></name><name name-style="western"><surname>Mohammadi</surname><given-names>M</given-names></name></person-group><article-title>The performance of various machine learning methods for Parkinson&#x2019;s disease recognition: a systematic review</article-title><source>Curr Psychol</source><year>2023</year><month>07</month><volume>42</volume><issue>20</issue><fpage>16637</fpage><lpage>16660</lpage><pub-id pub-id-type="doi">10.1007/s12144-022-02949-8</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramdhani</surname><given-names>RA</given-names></name><name name-style="western"><surname>Khojandi</surname><given-names>A</given-names></name><name name-style="western"><surname>Shylo</surname><given-names>O</given-names></name><name name-style="western"><surname>Kopell</surname><given-names>BH</given-names></name></person-group><article-title>Optimizing clinical assessments in Parkinson&#x2019;s disease through the use of wearable sensors and data driven modeling</article-title><source>Front Comput Neurosci</source><year>2018</year><month>09</month><day>11</day><volume>12</volume><fpage>72</fpage><pub-id pub-id-type="doi">10.3389/fncom.2018.00072</pub-id><pub-id pub-id-type="medline">30254580</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mei</surname><given-names>J</given-names></name><name name-style="western"><surname>Desrosiers</surname><given-names>C</given-names></name><name name-style="western"><surname>Frasnelli</surname><given-names>J</given-names></name></person-group><article-title>Machine learning for the diagnosis of Parkinson's disease: a review of literature</article-title><source>Front Aging Neurosci</source><year>2021</year><month>05</month><day>6</day><volume>13</volume><fpage>633752</fpage><pub-id pub-id-type="doi">10.3389/fnagi.2021.633752</pub-id><pub-id pub-id-type="medline">34025389</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mart&#x00ED;nez-Mart&#x00ED;n</surname><given-names>P</given-names></name><name name-style="western"><surname>Gil-Nagel</surname><given-names>A</given-names></name><name name-style="western"><surname>Gracia</surname><given-names>LM</given-names></name><name name-style="western"><surname>G&#x00F3;mez</surname><given-names>JB</given-names></name><name name-style="western"><surname>Mart&#x00ED;nez-Sarri&#x00E9;s</surname><given-names>J</given-names></name><name name-style="western"><surname>Bermejo</surname><given-names>F</given-names></name></person-group><article-title>Unified Parkinson&#x2019;s Disease Rating Scale characteristics and structure</article-title><source>Mov Disord</source><year>1994</year><month>01</month><volume>9</volume><issue>1</issue><fpage>76</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.1002/mds.870090112</pub-id><pub-id pub-id-type="medline">8139608</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoehn</surname><given-names>MM</given-names></name><name 
name-style="western"><surname>Yahr</surname><given-names>MD</given-names></name></person-group><article-title>Parkinsonism: onset, progression, and mortality</article-title><source>Neurology</source><year>1967</year><month>05</month><volume>17</volume><issue>5</issue><fpage>427</fpage><lpage>442</lpage><pub-id pub-id-type="doi">10.1212/wnl.17.5.427</pub-id><pub-id pub-id-type="medline">6067254</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verbaan</surname><given-names>D</given-names></name><name name-style="western"><surname>van Rooden</surname><given-names>SM</given-names></name><name name-style="western"><surname>van Hilten</surname><given-names>JJ</given-names></name><name name-style="western"><surname>Rijsman</surname><given-names>RM</given-names></name></person-group><article-title>Prevalence and clinical profile of restless legs syndrome in Parkinson&#x2019;s disease</article-title><source>Mov Disord</source><year>2010</year><month>10</month><day>15</day><volume>25</volume><issue>13</issue><fpage>2142</fpage><lpage>2147</lpage><pub-id pub-id-type="doi">10.1002/mds.23241</pub-id><pub-id pub-id-type="medline">20737549</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mart&#x00ED;nez-Fern&#x00E1;ndez</surname><given-names>R</given-names></name><name name-style="western"><surname>Schmitt</surname><given-names>E</given-names></name><name name-style="western"><surname>Martinez-Martin</surname><given-names>P</given-names></name><name name-style="western"><surname>Krack</surname><given-names>P</given-names></name></person-group><article-title>The hidden sister of motor fluctuations in Parkinson&#x2019;s disease: a review on nonmotor fluctuations</article-title><source>Mov 
Disord</source><year>2016</year><month>08</month><volume>31</volume><issue>8</issue><fpage>1080</fpage><lpage>1094</lpage><pub-id pub-id-type="doi">10.1002/mds.26731</pub-id><pub-id pub-id-type="medline">27431515</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jahanshahi</surname><given-names>M</given-names></name><name name-style="western"><surname>Wilkinson</surname><given-names>L</given-names></name><name name-style="western"><surname>Gahir</surname><given-names>H</given-names></name><name name-style="western"><surname>Dharmaindra</surname><given-names>A</given-names></name><name name-style="western"><surname>Lagnado</surname><given-names>DA</given-names></name></person-group><article-title>Medication impairs probabilistic classification learning in Parkinson&#x2019;s disease</article-title><source>Neuropsychologia</source><year>2010</year><month>03</month><volume>48</volume><issue>4</issue><fpage>1096</fpage><lpage>1103</lpage><pub-id pub-id-type="doi">10.1016/j.neuropsychologia.2009.12.010</pub-id><pub-id pub-id-type="medline">20006629</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Warmerdam</surname><given-names>E</given-names></name><name name-style="western"><surname>Romijnders</surname><given-names>R</given-names></name><name name-style="western"><surname>Hansen</surname><given-names>C</given-names></name><etal/></person-group><article-title>Arm swing responsiveness to dopaminergic medication in Parkinson&#x2019;s disease depends on task complexity</article-title><source>NPJ Parkinsons Dis</source><year>2021</year><month>10</month><day>5</day><volume>7</volume><issue>1</issue><fpage>89</fpage><pub-id pub-id-type="doi">10.1038/s41531-021-00235-1</pub-id><pub-id 
pub-id-type="medline">34611152</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>Q</given-names></name><name name-style="western"><surname>Jiang</surname><given-names>S</given-names></name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names></name></person-group><article-title>The performance stability of defect prediction models with class imbalance: an empirical study</article-title><source>IEICE Trans Inf Syst</source><year>2017</year><volume>E100.D</volume><issue>2</issue><fpage>265</fpage><lpage>272</lpage><pub-id pub-id-type="doi">10.1587/transinf.2016EDP7204</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dinov</surname><given-names>ID</given-names></name><name name-style="western"><surname>Heavner</surname><given-names>B</given-names></name><name name-style="western"><surname>Tang</surname><given-names>M</given-names></name><etal/></person-group><article-title>Predictive big data analytics: a study of Parkinson's disease using large, complex, heterogeneous, incongruent, multi-source and incomplete observations</article-title><source>PLoS One</source><year>2016</year><month>08</month><day>5</day><volume>11</volume><issue>8</issue><fpage>e0157077</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0157077</pub-id><pub-id pub-id-type="medline">27494614</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Brownlee</surname><given-names>J</given-names></name></person-group><source>Imbalanced Classification with Python: Choose Better Metrics, Balance Skewed Classes, and Apply Cost-Sensitive 
Learning</source><year>2020</year><publisher-name>Machine Learning Mastery</publisher-name></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Fern&#x00E1;ndez</surname><given-names>A</given-names></name><name name-style="western"><surname>Garc&#x00ED;a</surname><given-names>S</given-names></name><name name-style="western"><surname>Galar</surname><given-names>M</given-names></name><name name-style="western"><surname>Prati</surname><given-names>RC</given-names></name><name name-style="western"><surname>Krawczyk</surname><given-names>B</given-names></name><name name-style="western"><surname>Herrera</surname><given-names>F</given-names></name></person-group><source>Learning from Imbalanced Data Sets</source><year>2018</year><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-3-319-98074-4</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>H</given-names></name><name name-style="western"><surname>Garcia</surname><given-names>EA</given-names></name></person-group><article-title>Learning from imbalanced data</article-title><source>IEEE Trans Knowl Data Eng</source><year>2009</year><month>09</month><volume>21</volume><issue>9</issue><fpage>1263</fpage><lpage>1284</lpage><pub-id pub-id-type="doi">10.1109/TKDE.2008.239</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Megahed</surname><given-names>FM</given-names></name><name name-style="western"><surname>Chen</surname><given-names>YJ</given-names></name><name name-style="western"><surname>Megahed</surname><given-names>A</given-names></name><name 
name-style="western"><surname>Ong</surname><given-names>Y</given-names></name><name name-style="western"><surname>Altman</surname><given-names>N</given-names></name><name name-style="western"><surname>Krzywinski</surname><given-names>M</given-names></name></person-group><article-title>The class imbalance problem</article-title><source>Nat Methods</source><year>2021</year><month>11</month><volume>18</volume><issue>11</issue><fpage>1270</fpage><lpage>1272</lpage><pub-id pub-id-type="doi">10.1038/s41592-021-01302-4</pub-id><pub-id pub-id-type="medline">34654918</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van den Goorbergh</surname><given-names>R</given-names></name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names></name><name name-style="western"><surname>Timmerman</surname><given-names>D</given-names></name><name name-style="western"><surname>van Calster</surname><given-names>B</given-names></name></person-group><article-title>The harm of class imbalance corrections for risk prediction models: illustration and simulation using logistic regression</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>08</month><day>16</day><volume>29</volume><issue>9</issue><fpage>1525</fpage><lpage>1534</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac093</pub-id><pub-id pub-id-type="medline">35686364</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moon</surname><given-names>S</given-names></name><name name-style="western"><surname>Song</surname><given-names>HJ</given-names></name><name name-style="western"><surname>Sharma</surname><given-names>VD</given-names></name><etal/></person-group><article-title>Classification of Parkinson&#x2019;s disease and essential tremor 
based on balance and gait characteristics from wearable motion sensors via machine learning techniques: a data-driven approach</article-title><source>J Neuroeng Rehabil</source><year>2020</year><month>09</month><day>11</day><volume>17</volume><issue>1</issue><fpage>125</fpage><pub-id pub-id-type="doi">10.1186/s12984-020-00756-5</pub-id><pub-id pub-id-type="medline">32917244</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Veeraragavan</surname><given-names>S</given-names></name><name name-style="western"><surname>Gopalai</surname><given-names>AA</given-names></name><name name-style="western"><surname>Gouwanda</surname><given-names>D</given-names></name><name name-style="western"><surname>Ahmad</surname><given-names>SA</given-names></name></person-group><article-title>Parkinson&#x2019;s disease diagnosis and severity assessment using ground reaction forces and neural networks</article-title><source>Front Physiol</source><year>2020</year><month>11</month><day>9</day><volume>11</volume><fpage>587057</fpage><pub-id pub-id-type="doi">10.3389/fphys.2020.587057</pub-id><pub-id pub-id-type="medline">33240106</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Falchetti</surname><given-names>M</given-names></name><name name-style="western"><surname>Prediger</surname><given-names>RD</given-names></name><name name-style="western"><surname>Zanotto-Filho</surname><given-names>A</given-names></name></person-group><article-title>Classification algorithms applied to blood-based transcriptome meta-analysis to predict idiopathic Parkinson&#x2019;s disease</article-title><source>Comput Biol Med</source><year>2020</year><month>09</month><volume>124</volume><fpage>103925</fpage><pub-id 
pub-id-type="doi">10.1016/j.compbiomed.2020.103925</pub-id><pub-id pub-id-type="medline">32889300</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeancolas</surname><given-names>L</given-names></name><name name-style="western"><surname>Petrovska-Delacr&#x00E9;taz</surname><given-names>D</given-names></name><name name-style="western"><surname>Mangone</surname><given-names>G</given-names></name><etal/></person-group><article-title>X-vectors: new quantitative biomarkers for early Parkinson&#x2019;s disease detection from speech</article-title><source>Front Neuroinform</source><year>2021</year><month>02</month><day>19</day><volume>15</volume><fpage>578369</fpage><pub-id pub-id-type="doi">10.3389/fninf.2021.578369</pub-id><pub-id pub-id-type="medline">33679361</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lever</surname><given-names>J</given-names></name><name name-style="western"><surname>Krzywinski</surname><given-names>M</given-names></name><name name-style="western"><surname>Altman</surname><given-names>N</given-names></name></person-group><article-title>Model selection and overfitting</article-title><source>Nat Methods</source><year>2016</year><month>09</month><volume>13</volume><issue>9</issue><fpage>703</fpage><lpage>704</lpage><pub-id pub-id-type="doi">10.1038/nmeth.3968</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrington</surname><given-names>PdB</given-names></name></person-group><article-title>Multiple versus single set validation of multivariate models to avoid mistakes</article-title><source>Crit Rev Anal 
Chem</source><year>2018</year><month>01</month><day>2</day><volume>48</volume><issue>1</issue><fpage>33</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1080/10408347.2017.1361314</pub-id><pub-id pub-id-type="medline">28777019</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Refaeilzadeh</surname><given-names>P</given-names></name><name name-style="western"><surname>Tang</surname><given-names>L</given-names></name><name name-style="western"><surname>Liu</surname><given-names>H</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Liu</surname><given-names>L</given-names></name><name name-style="western"><surname>&#x00D6;zsu</surname><given-names>MT</given-names></name></person-group><article-title>Cross-validation</article-title><source>Encyclopedia of Database Systems</source><year>2009</year><publisher-name>Springer</publisher-name><fpage>532</fpage><lpage>538</lpage><pub-id pub-id-type="doi">10.1007/978-0-387-39940-9_565</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dietterich</surname><given-names>TG</given-names></name></person-group><article-title>Approximate statistical tests for comparing supervised classification learning algorithms</article-title><source>Neural Comput</source><year>1998</year><month>09</month><day>15</day><volume>10</volume><issue>7</issue><fpage>1895</fpage><lpage>1923</lpage><pub-id pub-id-type="doi">10.1162/089976698300017197</pub-id><pub-id pub-id-type="medline">9744903</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saeb</surname><given-names>S</given-names></name><name 
name-style="western"><surname>Lonini</surname><given-names>L</given-names></name><name name-style="western"><surname>Jayaraman</surname><given-names>A</given-names></name><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names></name><name name-style="western"><surname>Kording</surname><given-names>KP</given-names></name></person-group><article-title>The need to approximate the use-case in clinical machine learning</article-title><source>Gigascience</source><year>2017</year><month>05</month><day>1</day><volume>6</volume><issue>5</issue><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1093/gigascience/gix019</pub-id><pub-id pub-id-type="medline">28327985</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Little</surname><given-names>MA</given-names></name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names></name><name name-style="western"><surname>Saeb</surname><given-names>S</given-names></name><etal/></person-group><article-title>Using and understanding cross-validation strategies. 
Perspectives on Saeb et al</article-title><source>Gigascience</source><year>2017</year><month>05</month><day>1</day><volume>6</volume><issue>5</issue><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1093/gigascience/gix020</pub-id><pub-id pub-id-type="medline">28327989</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Westerhuis</surname><given-names>JA</given-names></name><name name-style="western"><surname>Hoefsloot</surname><given-names>HCJ</given-names></name><name name-style="western"><surname>Smit</surname><given-names>S</given-names></name><etal/></person-group><article-title>Assessment of PLSDA cross validation</article-title><source>Metabolomics</source><year>2008</year><month>03</month><volume>4</volume><issue>1</issue><fpage>81</fpage><lpage>89</lpage><pub-id pub-id-type="doi">10.1007/s11306-007-0099-6</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cawley</surname><given-names>GC</given-names></name><name name-style="western"><surname>Talbot</surname><given-names>NLC</given-names></name></person-group><article-title>On over-fitting in model selection and subsequent selection bias in performance evaluation</article-title><source>J Mach Learn Res</source><year>2010</year><month>10</month><day>7</day><access-date>2024-04-30</access-date><volume>11</volume><fpage>2079</fpage><lpage>2107</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf">https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Rao</surname><given-names>RB</given-names></name><name name-style="western"><surname>Fung</surname><given-names>G</given-names></name><name name-style="western"><surname>Rosales</surname><given-names>R</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Apte</surname><given-names>C</given-names></name><name name-style="western"><surname>Park</surname><given-names>H</given-names></name><name name-style="western"><surname>Wang</surname><given-names>K</given-names></name><etal/></person-group><article-title>On the dangers of cross-validation. an experimental evaluation</article-title><source>Proceedings of the 2008 SIAM International Conference on Data Mining</source><year>2008</year><publisher-name>Society for Industrial and Applied Mathematics</publisher-name><fpage>588</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1137/1.9781611972788.54</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ying</surname><given-names>X</given-names></name></person-group><article-title>An overview of overfitting and its solutions</article-title><source>J Phys Conf Ser</source><year>2019</year><volume>1168</volume><issue>2</issue><fpage>022022</fpage><pub-id pub-id-type="doi">10.1088/1742-6596/1168/2/022022</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bergstra</surname><given-names>J</given-names></name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names></name></person-group><article-title>Random search for hyper-parameter optimization</article-title><source>J Mach Learn 
Res</source><year>2012</year><month>12</month><day>2</day><access-date>2024-04-30</access-date><volume>13</volume><fpage>281</fpage><lpage>305</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf">https://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Claesen</surname><given-names>M</given-names></name><name name-style="western"><surname>de Moor</surname><given-names>B</given-names></name></person-group><article-title>Hyperparameter search in machine learning</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 6, 2015</comment><pub-id pub-id-type="doi">10.48550/arXiv.1502.02127</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stevens</surname><given-names>LM</given-names></name><name name-style="western"><surname>Mortazavi</surname><given-names>BJ</given-names></name><name name-style="western"><surname>Deo</surname><given-names>RC</given-names></name><name name-style="western"><surname>Curtis</surname><given-names>L</given-names></name><name name-style="western"><surname>Kao</surname><given-names>DP</given-names></name></person-group><article-title>Recommendations for reporting machine learning analyses in clinical research</article-title><source>Circ Cardiovasc Qual Outcomes</source><year>2020</year><month>10</month><volume>13</volume><issue>10</issue><fpage>e006556</fpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.120.006556</pub-id><pub-id pub-id-type="medline">33079589</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Yang</surname><given-names>L</given-names></name><name name-style="western"><surname>Shami</surname><given-names>A</given-names></name></person-group><article-title>On hyperparameter optimization of machine learning algorithms: theory and practice</article-title><source>Neurocomputing</source><year>2020</year><month>11</month><day>20</day><volume>415</volume><fpage>295</fpage><lpage>316</lpage><pub-id pub-id-type="doi">10.1016/j.neucom.2020.07.061</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bin Rafiq</surname><given-names>R</given-names></name><name name-style="western"><surname>Modave</surname><given-names>F</given-names></name><name name-style="western"><surname>Guha</surname><given-names>S</given-names></name><name name-style="western"><surname>Albert</surname><given-names>MV</given-names></name></person-group><article-title>Validation methods to promote real-world applicability of machine learning in medicine</article-title><source>DMIP &#x2019;20: 2020 3rd International Conference on Digital Medicine and Image Processing</source><year>2020</year><publisher-name>Association for Computing Machinery</publisher-name><fpage>13</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1145/3441369.3441372</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goberman</surname><given-names>A</given-names></name><name name-style="western"><surname>Coelho</surname><given-names>C</given-names></name><name name-style="western"><surname>Robb</surname><given-names>M</given-names></name></person-group><article-title>Phonatory characteristics of Parkinsonian speech before and after morning medication: the on and off states</article-title><source>J Commun 
Disord</source><year>2002</year><volume>35</volume><issue>3</issue><fpage>217</fpage><lpage>239</lpage><pub-id pub-id-type="doi">10.1016/s0021-9924(01)00072-7</pub-id><pub-id pub-id-type="medline">12064785</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adamson</surname><given-names>MB</given-names></name><name name-style="western"><surname>Gilmore</surname><given-names>G</given-names></name><name name-style="western"><surname>Stratton</surname><given-names>TW</given-names></name><name name-style="western"><surname>Baktash</surname><given-names>N</given-names></name><name name-style="western"><surname>Jog</surname><given-names>MS</given-names></name></person-group><article-title>Medication status and dual-tasking on turning strategies in Parkinson disease</article-title><source>J Neurol Sci</source><year>2019</year><month>01</month><day>15</day><volume>396</volume><fpage>206</fpage><lpage>212</lpage><pub-id pub-id-type="doi">10.1016/j.jns.2018.11.028</pub-id><pub-id pub-id-type="medline">30504066</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liao</surname><given-names>L</given-names></name><name name-style="western"><surname>Li</surname><given-names>H</given-names></name><name name-style="western"><surname>Shang</surname><given-names>W</given-names></name><name name-style="western"><surname>Ma</surname><given-names>L</given-names></name></person-group><article-title>An empirical study of the impact of hyperparameter tuning and model optimization on the performance properties of deep neural networks</article-title><source>ACM Trans Softw Eng Methodol</source><year>2022</year><month>04</month><day>9</day><volume>31</volume><issue>3</issue><fpage>1</fpage><lpage>40</lpage><pub-id 
pub-id-type="doi">10.1145/3506695</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>J</given-names></name><name name-style="western"><surname>Manderson</surname><given-names>T</given-names></name><name name-style="western"><surname>Abrahamowicz</surname><given-names>M</given-names></name><name name-style="western"><surname>Buckeridge</surname><given-names>DL</given-names></name><name name-style="western"><surname>Tamblyn</surname><given-names>R</given-names></name></person-group><article-title>Can hyperparameter tuning improve the performance of a super learner? a case study</article-title><source>Epidemiology</source><year>2019</year><month>07</month><volume>30</volume><issue>4</issue><fpage>521</fpage><lpage>531</lpage><pub-id pub-id-type="doi">10.1097/EDE.0000000000001027</pub-id><pub-id pub-id-type="medline">30985529</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>P</given-names></name><name name-style="western"><surname>Han</surname><given-names>K</given-names></name><name name-style="western"><surname>Wei</surname><given-names>XS</given-names></name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names></name><name name-style="western"><surname>Wang</surname><given-names>L</given-names></name></person-group><article-title>Contrastive learning based hybrid networks for long-tailed image classification</article-title><conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 20 to 25, 2021</conf-date><conf-loc>Nashville, TN</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00100</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names></name><name name-style="western"><surname>Li</surname><given-names>W</given-names></name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names></name></person-group><article-title>Memory-based jitter: improving visual recognition on long-tailed data with diversity in memory</article-title><source>Proc AAAI Conf Artif Intell</source><year>2022</year><month>06</month><day>28</day><volume>36</volume><issue>2</issue><fpage>1720</fpage><lpage>1728</lpage><pub-id pub-id-type="doi">10.1609/aaai.v36i2.20064</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Xia</surname><given-names>R</given-names></name><name name-style="western"><surname>Ding</surname><given-names>Z</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Korhonen</surname><given-names>A</given-names></name><name name-style="western"><surname>Traum</surname><given-names>D</given-names></name><name name-style="western"><surname>M&#x00E1;rquez</surname><given-names>L</given-names></name></person-group><article-title>Emotion-cause pair extraction: a new task to emotion analysis in texts</article-title><source>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</source><year>2019</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>1003</fpage><lpage>1012</lpage><pub-id pub-id-type="doi">10.18653/v1/P19-1096</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>King</surname><given-names>RD</given-names></name><name 
name-style="western"><surname>Orhobor</surname><given-names>OI</given-names></name><name name-style="western"><surname>Taylor</surname><given-names>CC</given-names></name></person-group><article-title>Cross-validation is safe to use</article-title><source>Nat Mach Intell</source><year>2021</year><month>04</month><day>20</day><volume>3</volume><issue>4</issue><fpage>276</fpage><pub-id pub-id-type="doi">10.1038/s42256-021-00332-z</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Customized questionnaire.</p><media xlink:href="medinform_v12i1e50117_app1.pdf" xlink:title="PDF File, 74 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>List of included studies.</p><media xlink:href="medinform_v12i1e50117_app2.pdf" xlink:title="PDF File, 171 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Graphical representations of data.</p><media xlink:href="medinform_v12i1e50117_app3.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app4"><label>Checklist 1</label><p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) checklist.</p><media xlink:href="medinform_v12i1e50117_app4.pdf" xlink:title="PDF File, 83 KB"/></supplementary-material></app-group></back></article>