<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v14i1e77561</article-id>
      <article-id pub-id-type="pmid">41539675</article-id>
      <article-id pub-id-type="doi">10.2196/77561</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Prompting and Fine-Tuning Large Language Models for Parkinson Disease Diagnosis: Comparative Evaluation Study Using the PPMI Structured Dataset</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hungbo</surname>
            <given-names>Akonasu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Acharya</surname>
            <given-names>Nirajan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Shin</surname>
            <given-names>Hyun-Ji</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5239-8979</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Jeong</surname>
            <given-names>Young Jin</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7611-8185</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jun</surname>
            <given-names>Sungmin</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0838-9236</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Kang</surname>
            <given-names>Do-Young</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>Department of Nuclear Medicine</institution>
            <institution>College of Medicine</institution>
            <institution>Dong-A University</institution>
            <addr-line>26, Daesingongwon-ro, Seo-gu</addr-line>
            <addr-line>Busan, 49201</addr-line>
            <country>Republic of Korea</country>
            <phone>82 51 240 5630</phone>
            <email>dykang@dau.ac.kr</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1688-0818</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Data Sciences Convergence</institution>
        <institution>Graduate School</institution>
        <institution>Dong-A University</institution>
        <addr-line>Busan</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Institute of Convergence Bio-Health</institution>
        <institution>Dong-A University</institution>
        <addr-line>Busan</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Nuclear Medicine</institution>
        <institution>College of Medicine</institution>
        <institution>Dong-A University</institution>
        <addr-line>Busan</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Nuclear Medicine</institution>
        <institution>Dong-A University Hospital</institution>
        <addr-line>Busan</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Do-Young Kang <email>dykang@dau.ac.kr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>15</day>
        <month>1</month>
        <year>2026</year>
      </pub-date>
      <volume>14</volume>
      <elocation-id>e77561</elocation-id>
      <history>
        <date date-type="received">
          <day>15</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>24</day>
          <month>12</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>24</day>
          <month>12</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Hyun-Ji Shin, Young Jin Jeong, Sungmin Jun, Do-Young Kang. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 15.01.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2026/1/e77561" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Parkinson disease (PD) presents diagnostic challenges due to its heterogeneous motor and nonmotor manifestations. Traditional machine learning (ML) approaches have been evaluated on structured clinical variables. However, the diagnostic utility of large language models (LLMs) using natural language representations of structured clinical data remains underexplored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to evaluate the diagnostic classification performance of multiple LLMs using natural language prompts derived from structured clinical data and to compare their performance with traditional ML baselines.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We reformatted structured clinical variables from the Parkinson’s Progression Markers Initiative (PPMI) dataset into natural language prompts and used them as inputs for several LLMs. Variables with high multicollinearity were removed, and the top 10 features were selected using Shapley additive explanations (SHAP)–based feature ranking. LLM performance was examined across few-shot prompting, dual-output prompting that additionally generated post hoc explanatory text as an exploratory component, and supervised fine-tuning. Logistic regression (LR) and support vector machine (SVM) classifiers served as ML baselines. Model performance was evaluated using <italic>F</italic><sub>1</sub>-scores on both the test set and a temporally independent validation set (temporal validation set) of limited size, and repeated output generation was carried out to assess stability.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>On the test set of 122 participants, LR and SVM trained on the 10 SHAP-selected clinical variables each achieved a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.960 (accuracy 0.975). LLMs receiving natural language prompts derived from the same variables reached comparable performance, with the best few-shot configurations achieving macro-averaged <italic>F</italic><sub>1</sub>-scores of 0.987 (accuracy 0.992). In the temporal validation set of 31 participants, LR maintained a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.903, whereas SVM showed substantial performance degradation. In contrast, multiple LLMs sustained high diagnostic performance, reaching macro-averaged <italic>F</italic><sub>1</sub>-scores up to 0.968 and high recall for PD. Repeated output generation across LLM conditions produced generally stable predictions, with rare variability observed across runs. Under dual-output prompting, diagnostic performance showed a reduction relative to few-shot prompting while remaining generally stable. Supervised fine-tuning of lightweight models improved stability and enabled GPT-4o-mini to achieve a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.987 on the test set, with uniformly correct predictions observed in the small temporal validation set, which should be interpreted cautiously given the limited sample size and exploratory nature of the evaluation.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study provides an exploratory benchmark of how modern LLMs process structured clinical variables in natural language form. While several models achieved diagnostic performance comparable to LR across both the test and temporal validation datasets, their outputs were sensitive to prompting formats, model choice, and class distributions. Occasional variability across repeated output generations reflected the stochastic nature of LLMs, and lightweight models required supervised fine-tuning for stable generalization. These findings highlight the capabilities and limitations of current LLMs in handling tabular clinical information and underscore the need for cautious application and further investigation.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>Claude</kwd>
        <kwd>diagnostic classification</kwd>
        <kwd>fine-tuning</kwd>
        <kwd>Gemini</kwd>
        <kwd>GPT</kwd>
        <kwd>large language models</kwd>
        <kwd>LLaMA</kwd>
        <kwd>Parkinson disease</kwd>
        <kwd>Parkinson’s Progression Markers Initiative</kwd>
        <kwd>PPMI</kwd>
        <kwd>prompt engineering</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Parkinson disease (PD) is the second most common neurodegenerative disorder and presents substantial diagnostic challenges due to the heterogeneity of its motor and nonmotor symptoms [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Despite sustained research efforts, early diagnostic accuracy remains limited, and the variability of clinical presentations continues to complicate reliable classification [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Large-scale initiatives such as the Parkinson’s Progression Markers Initiative (PPMI) [<xref ref-type="bibr" rid="ref6">6</xref>] have facilitated systematic evaluation of clinical, imaging, and biomarker variables, providing a foundation for quantitative approaches to PD diagnosis.</p>
      <p>Machine learning (ML) models, including logistic regression (LR) [<xref ref-type="bibr" rid="ref7">7</xref>], support vector machines (SVM) [<xref ref-type="bibr" rid="ref8">8</xref>], and tree-based classifiers, have demonstrated strong performance when applied to structured clinical variables [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. However, these models operate on fixed feature representations and do not naturally support flexible natural language inputs or generative reasoning, limiting their applicability when structured clinical data must be reformatted into descriptive text.</p>
      <p>Recent large language models (LLMs) exhibit strong capabilities in processing natural language information and have shown promise in clinical applications involving unstructured text such as clinical notes and reports [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. LLM-generated rationales typically represent post hoc explanatory text rather than true interpretability [<xref ref-type="bibr" rid="ref14">14</xref>], and prior studies have reported challenges such as prompt sensitivity, stochastic variability, and susceptibility to distributional shifts when LLMs are applied to medical tasks [<xref ref-type="bibr" rid="ref15">15</xref>]. However, their ability to perform diagnostic classification when structured clinical variables are reformatted into natural language prompts remains insufficiently explored, and recent work evaluating the performance of LLMs on structured or tabular data has reported limited and inconsistent results [<xref ref-type="bibr" rid="ref16">16</xref>]. These characteristics highlight the need for careful and systematic evaluation when applying LLMs to structured clinical information.</p>
      <p>In this context, the goal of this study was to conduct an exploratory benchmark of modern LLMs for PD classification using natural language prompts derived from structured clinical variables in the PPMI dataset. We compared multiple LLM families with traditional ML baselines and evaluated their behavior across few-shot prompting, dual-output prompting, and supervised fine-tuning. The study aimed to characterize both the capabilities and the limitations of LLMs in handling tabular clinical information, rather than demonstrating superiority over conventional ML approaches.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates the overall methodological workflow of the study, detailing the progression from dataset selection and feature preprocessing to prompt construction, model training, and evaluation. It provides a concise visual summary of the experimental pipeline described in the Methods section.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study design overview illustrating the overall experimental pipeline, including dataset selection, prompting strategies, fine-tuning, and evaluation procedures. HC: healthy controls; LLM: large language model; MD: markdown; MD+ST: markdown with special token; ML: machine learning; PD: Parkinson disease; PPMI: Parkinson’s Progression Markers Initiative; PT: plain text; SHAP: Shapley additive explanations; ST: special token; VIF: variance inflation factor.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Dataset and Preprocessing</title>
        <p>This study used a curated dataset provided by the PPMI, downloaded on July 29, 2024 (data release: July 3, 2024). The initial dataset consisted of 4 participant cohorts: individuals with PD, healthy controls (HC), participants with scans without evidence of dopaminergic deficit, and prodromal participants. For this study, only PD and HC participants were included.</p>
        <p>Duplicate observations were removed by retaining the most comprehensive examination timepoint for each participant. Variables with more than 20% missingness were excluded. Missing values in the remaining variables were imputed using mean imputation for numerical variables and mode imputation for categorical variables [<xref ref-type="bibr" rid="ref17">17</xref>]. Participants included in the held-out test set and in the temporally independent validation set (temporal validation set) did not contain missing values and therefore required no imputation. Missing-value imputation was performed only for the training portion of the development dataset, which consisted of the train and validation subsets. Imputation statistics (mean and mode) were computed exclusively from the training split and subsequently applied to the validation split. This procedure ensured that no information from the test or temporal validation sets influenced the preprocessing of the training data. After preprocessing, the final development dataset consisted of 1360 participants (PD, n=1063; HC, n=297).</p>
        <p>A temporal validation set was constructed using the most recent curated PPMI dataset released on December 11, 2024. To ensure temporal separation, all data contained in the July 3, 2024, release were excluded, and only data from participants newly recruited after this release were included. This temporally separated dataset was used to assess the generalizability of model performance to data collected at a later timepoint. In clinical research, obtaining an external dataset with identical conditions is often challenging, and temporally separated datasets serve as a practical and widely accepted alternative for external validation [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>Demographic characteristics of both the development and temporal validation datasets, including age, sex, education, and race, are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Demographic information of participants by dataset group.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="250"/>
            <col width="180"/>
            <col width="180"/>
            <col width="180"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Characteristic</td>
                <td colspan="2">Development set</td>
                <td colspan="2">Temporal validation set<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>PD<sup>b</sup> (n=1063)</td>
                <td>HC<sup>c</sup> (n=297)</td>
                <td>PD (n=15)</td>
                <td>HC (n=16)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">Age (years), mean (SD; range)</td>
                <td>63.3 (9.7; 30.7-85.3)</td>
                <td>62.4 (10.6; 30.4-83.7)</td>
                <td>62.7 (10.9; 31.3-73.0)</td>
                <td>63.9 (11.3; 30.4-79.4)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Sex, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>686 (64.5)</td>
                <td>195 (65.7)</td>
                <td>8 (53.3)</td>
                <td>3 (18.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>377 (35.5)</td>
                <td>102 (34.3)</td>
                <td>7 (46.7)</td>
                <td>13 (81.3)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Education (years), mean (SD; range)</td>
                <td>16.0 (2.8; 6-20)</td>
                <td>16.0 (2.7; 8-20)</td>
                <td>16.8 (2.0; 11-20)</td>
                <td>16.7 (2.0; 12-20)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Race, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>White</td>
                <td>973 (91.5)</td>
                <td>272 (91.6)</td>
                <td>13 (86.7)</td>
                <td>16 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Black</td>
                <td>30 (2.8)</td>
                <td>10 (3.4)</td>
                <td>0 (0)</td>
                <td>0 (0)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Asian</td>
                <td>21 (2)</td>
                <td>2 (0.7)</td>
                <td>1 (6.7)</td>
                <td>0 (0)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Other</td>
                <td>39 (3.7)</td>
                <td>13 (4.4)</td>
                <td>1 (6.7)</td>
                <td>0 (0)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Temporal validation set includes participants newly enrolled after the July 3, 2024, data release (excluded from the development set) and serves as a temporally separated dataset for external validation.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>HC: healthy controls.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The development dataset was randomly divided into training, validation, and test subsets following an approximately 7:2:1 ratio. Only complete cases were included in the test set to ensure reliable evaluation. Feature selection and fine-tuning were performed exclusively using the training set. The test set was used solely for final model evaluation and was not involved in any feature selection or model fitting procedures. Class distributions for each subset are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Dataset splits and class distribution for diagnostic modeling.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Datasets and subsets</td>
                <td>Total, n</td>
                <td>PD<sup>a</sup>, n (%)</td>
                <td>HC<sup>b</sup>, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Development set</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Training set</td>
                <td>990</td>
                <td>771 (77.9)</td>
                <td>219 (22.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Validation set</td>
                <td>248</td>
                <td>193 (77.8)</td>
                <td>55 (22.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Test set</td>
                <td>122</td>
                <td>99 (81.1)</td>
                <td>23 (18.9)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Temporal validation set</td>
                <td>31</td>
                <td>15 (48.4)</td>
                <td>16 (51.6)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>HC: healthy controls.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Feature Selection</title>
        <sec>
          <title>Multicollinearity Evaluation</title>
          <p>To address multicollinearity, we generated a correlation matrix and computed the variance inflation factor for all candidate variables [<xref ref-type="bibr" rid="ref20">20</xref>]. Variables with variance inflation factor values exceeding 10, a commonly accepted threshold for severe multicollinearity, were removed to prevent unstable coefficients and improve model reliability [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. A total of 70 variables remained after this screening step and were used for Shapley additive explanations (SHAP)–based feature selection.</p>
        </sec>
        <sec>
          <title>SHAP-Based Feature Importance</title>
          <p>Feature importance was evaluated using a weighted average of SHAP values obtained from 4 tree-based models, including random forest [<xref ref-type="bibr" rid="ref23">23</xref>], XGBoost [<xref ref-type="bibr" rid="ref24">24</xref>], LightGBM [<xref ref-type="bibr" rid="ref25">25</xref>], and CatBoost [<xref ref-type="bibr" rid="ref26">26</xref>]. These models were selected because they are widely used for tabular data and are known to mitigate issues related to multicollinearity while accommodating mixtures of numerical and categorical features effectively [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
          <p>For each model, SHAP values were computed for all remaining variables, normalized, and then combined using a weighted averaging approach based on the diagnostic performance of each model. The performance metrics and assigned weights are summarized in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <p>The weighted average SHAP value for each feature was calculated as [<xref ref-type="bibr" rid="ref28">28</xref>]:</p>
          <p>
            <graphic xlink:href="medinform_v14i1e77561_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </p>
          <p>where:</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>ϕ<sub>ij</sub></italic> represents the SHAP value of feature <italic>j</italic> computed by model <italic>i</italic></p>
            </list-item>
            <list-item>
              <p><italic>w<sub>i</sub></italic> represents the performance-based weight assigned to model <italic>i</italic></p>
            </list-item>
            <list-item>
              <p>the denominator <inline-graphic xlink:href="medinform_v14i1e77561_fig11.png" xlink:type="simple" mimetype="image"/>
ensures that weights are normalized so they sum to 1.</p>
            </list-item>
          </list>
          <p>The top 10 variables were selected based on these weighted average SHAP values, as shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. The final feature set consisted of clinically meaningful motor, olfactory, and imaging features. Specifically, the selected variables were updrs3_score, con_putamen, updrs_totscore, updrs2_score, lowput_expected, upsit_pctl, DATSCAN_PUTAMEN_L, con_striatum, mean_putamen, and DATSCAN_PUTAMEN_R. These features were used consistently across all subsequent ML and LLM experiments.</p>
        </sec>
      </sec>
      <sec>
        <title>Prompt Construction</title>
        <p>To evaluate the performance of LLMs, we developed 4 prompting formats that express input variables in plain text (PT), markdown (MD), special token (ST) annotations, and a combined markdown with special token (MD+ST) structure [<xref ref-type="bibr" rid="ref29">29</xref>]. The PT format presents the input features as natural language sentences, whereas the MD format uses a structured table to delineate characteristics and enhance feature recognition, and the combined MD+ST format integrates both structured layout and explicit ST annotation.</p>
        <p>Each prompting format was applied under conditions ranging from 0-shot to 3-shot. All few-shot settings incorporated balanced PD and HC examples, with 1 pair for 1-shot, 2 pairs for 2-shot, and 3 pairs for 3-shot prompting. For dual-output prompting, the diagnostic label was followed by 3 sentences of post hoc explanatory text, which served as an exploratory output and did not represent true model interpretability [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
        <p>For dual-output prompting, each LLM was instructed to generate a diagnostic label (PD or HC) followed by 3 sentences of post hoc explanatory text with a maximum length of 180 tokens to ensure comparability across models. Examples of all prompting formats, few-shot configurations, and sample outputs are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
      </sec>
      <sec>
        <title>Modeling Approaches</title>
        <p>This study compared traditional ML classifiers with multiple LLMs in order to evaluate diagnostic performance under consistent experimental conditions. As deterministic baselines, we trained LR with L2 regularization and an SVM with a radial basis function (RBF) kernel, using the top 10 SHAP-selected features. Both models were implemented using standard scikit-learn procedures without extensive hyperparameter tuning, since the goal was to establish transparent and reproducible baselines rather than to maximize predictive performance. The complete hyperparameter configurations for the ML classifiers are summarized in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. These models were evaluated following the procedures described in the “Training and Evaluation Settings” section.</p>
        <p>For LLM-based classification, we selected model families that collectively capture diversity in architecture, accessibility, and usage constraints. The included families were LLaMA, GPT, Gemini, and Claude. Each family provided one or more model variants with different parameter sizes or intended applications. The LLaMA models consisted of LLaMA 3.1 8B Instruct and LLaMA 3.3 70B Instruct, both of which were run locally using open-access model weights. The GPT models consisted of GPT-4o-mini and GPT-4o, accessed through the OpenAI application programming interface (API). The Gemini models included Gemini 1.5 Flash and Gemini 1.5 Pro, accessed through the Google Gemini API. For the Claude family, the Claude 3.5 Sonnet model was used. Although lightweight variants such as Claude Instant were available, the Sonnet model was selected due to budget considerations and its expected performance.</p>
        <p>All LLMs were accessed through their officially supported APIs or locally hosted implementations using Python-based programmatic interfaces. The same prompting formats, shot settings, and evaluation procedures were applied across all models, except where input-format restrictions required specific handling. Additional implementation details, including software environments and API configurations, are provided in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p>
      </sec>
      <sec>
        <title>Training and Evaluation Settings</title>
        <sec>
          <title>Overview</title>
          <p>The evaluation schedule and prompting phases for each model are summarized in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>, and the hardware and software environments used in all experiments are described in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>. To address the class imbalance between PD and HC, model performance was primarily assessed using the <italic>F</italic><sub>1</sub>-score, with precision and recall reported as supporting metrics. Accuracy was treated as a secondary measure, given that <italic>F</italic><sub>1</sub>-score provides a more balanced assessment under imbalanced classification tasks [<xref ref-type="bibr" rid="ref31">31</xref>].</p>
          <p>For all binary classification tasks, PD was assigned as the positive class (label=1) and HC as the negative class (label=0) across all ML and LLM experiments. Numerical features were standardized using <italic>z</italic> score scaling, where each variable was transformed by subtracting the mean and dividing by the SD computed from the training split. This ensured a consistent feature scale for both LR and SVM while preventing data leakage. Standardization using <italic>z</italic> score was applied exclusively to the ML baselines. For LLM-based experiments, models were provided with the original clinical values embedded in natural language prompts, rather than normalized numerical vectors. For ML models, performance uncertainty was quantified using 95% bootstrap CIs based on 1000 resampling iterations [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
          <p>For LLMs, diagnostic classification was evaluated under 0-shot to 3-shot settings using 4 prompting formats including PT, MD, ST, and MD+ST. All few-shot experiments incorporated balanced PD and HC examples at each shot level. Across all LLM conditions, each participant was evaluated through 30 repeated model executions under identical prompting conditions to account for the inherent nondeterminism of LLM outputs [<xref ref-type="bibr" rid="ref33">33</xref>]. Any change in diagnostic labels across these executions was recorded as a label inconsistency event. For dual-output prompting experiments, semantic consistency of the generated post hoc explanatory texts was also evaluated.</p>
          <p>For each LLM and prompting configuration, macro-averaged performance metrics (<italic>F</italic><sub>1</sub>-score, precision, recall, and accuracy) were computed for each of the 30 model executions. Nonparametric 95% CIs for LLM performance were estimated only for the temporal validation set using a hierarchical bootstrap procedure. In each iteration, participants were sampled with replacement and, for each selected participant, one prediction was randomly drawn from the 30 repeated trials. The 2.5th and 97.5th percentiles of the resulting performance distributions were reported in the temporal validation tables.</p>
          <p>All combinations of prompting format and shot number were evaluated for each LLM on the development test set. The prompting configuration reported as the best for each model corresponds to the combination that achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score on this test set. During temporal validation, each LLM was evaluated only under its best-performing configuration. This ensured that temporal validation assessed the generalizability of each model’s optimal setting rather than retesting all prompt-shot combinations on temporally independent data.</p>
        </sec>
        <sec>
          <title>ML Baseline Training Protocols</title>
          <p>Two conventional ML classifiers were trained as deterministic baselines for comparison with LLM-based diagnostic approaches. The models were LR with L2 regularization and an SVM with an RBF kernel, both of which were trained using the same feature subset identified in the SHAP-based feature selection step. Class weights were applied to address the imbalance between PD and HC samples in the development set.</p>
          <p>Both models were implemented using standard scikit-learn procedures without extensive hyperparameter optimization, as the primary goal was to establish transparent and reproducible baselines rather than to maximize predictive performance. LR was trained using the liblinear solver with balanced class weights, while SVM used an RBF kernel with probability estimation enabled. Performance for both models was evaluated for direct comparability with all LLM-based experiments.</p>
        </sec>
        <sec>
          <title>Few-Shot Prompting Evaluation of LLMs</title>
          <p>The LLMs were evaluated using the 4 prompting formats described earlier, applied across 0-, 1-, 2-, and 3-shot settings. Examples of PD and HC were balanced in all few-shot conditions to ensure consistent contextual exposure. All 4 prompting formats were applied to the LLaMA models. In contrast, only the PT and MD formats were applied to the GPT, Gemini, and Claude models because these API-based models do not support ST annotations in their input structure. Importantly, the inclusion of STs substantially influenced both diagnostic performance and output stability. Accordingly, ST-based prompting (including MD+ST) was applied exclusively to the LLaMA models as a model-specific input design choice, and the resulting differences in prompt structuring capability were explicitly considered as a limitation in cross-model comparisons.</p>
        </sec>
        <sec>
          <title>Dual-Output Prompting of LLMs for Diagnosis and Post Hoc Explanatory Text</title>
          <p>This experiment examined whether requiring LLMs to generate post hoc explanatory text could influence their diagnostic classification performance. Recent studies have shown that step-by-step prompting can improve LLM accuracy in complex decision tasks [<xref ref-type="bibr" rid="ref34">34</xref>], yet fully stepwise reasoning is often impractical in clinical contexts due to token limitations. Rather than instructing the models to articulate their reasoning step by step, the models were prompted to produce concise post hoc explanatory text. This approach enabled assessment of whether a reduced explanatory demand could affect the diagnostic output.</p>
          <p>Each model generated a binary diagnosis of PD or HC followed by exactly 3 sentences of post hoc explanatory text. To ensure consistency across models and to manage token usage, the length of these explanatory outputs was fixed. This dual-output prompting was applied to 4 models, including GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, and LLaMA 3.3 70B. Among these, the LLaMA model was prompted using the ST format, whereas the other models were prompted using the PT format because these models do not support ST inputs.</p>
          <p>Although the generated explanations were not reviewed by clinical experts, their semantic consistency was evaluated to assess the stability of the post hoc explanatory text under repeated prompting. High semantic consistency reflects only the reproducibility of the generated text across repeated runs and does not imply that the explanations are clinically accurate or factually grounded. For each participant, their explanatory outputs per model were collected, and pairwise cosine similarity between sentence embeddings was computed using the all-mpnet-base-v2 model from the Sentence-Transformers library [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. This semantic consistency analysis was performed only on the development set, while the temporal validation set was used exclusively for the primary diagnostic evaluation.</p>
        </sec>
        <sec>
          <title>LLM Supervised Fine-Tuning</title>
          <p>Supervised fine-tuning was conducted to evaluate whether labeled training data could improve the diagnostic performance and output stability of lightweight LLMs. Unlike prompt-based learning, which is constrained by the number of examples that can be included within a single prompt, supervised fine-tuning allows the model to learn diagnostic patterns directly from the full training dataset.</p>
          <p>Among the models tested in this study, GPT-4o-mini and Gemini 1.5 Flash were selected as compact and computationally efficient alternatives to assess how smaller LLMs perform relative to larger models. Both models were fine-tuned using the same dataset that was used in the prompting experiments, and all evaluations were performed under 0-shot conditions on the identical test set. The generation temperature was fixed at 0.1 to ensure consistent and deterministic behavior. Further details regarding fine-tuning platforms, sampling strategies, and training parameters are provided in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p>
          <p>The GPT-4o-mini model was fine-tuned through the OpenAI API. Training jobs were submitted from a local workstation after converting the dataset into JSONL format. A total of 1052 samples were used for training and 186 samples for validation. The fine-tuning process followed OpenAI’s job-based workflow, in which each training job is versioned and automatically made accessible through the OpenAI API upon completion.</p>
          <p>The Gemini 1.5 Flash model was fine-tuned using the Google AI Studio platform. Because the platform restricts training to a maximum of 500 samples, stratified sampling was applied to preserve the original class ratio, resulting in 383 PD samples and 117 HC samples. The model was trained for 1 epoch with a batch size of 1 and a learning rate of 1, after which the fine-tuned model was deployed via the Gemini API using the assigned model identifier.</p>
          <p>All fine-tuned models were trained and evaluated using the PT prompt format to maintain consistency between training and testing conditions.</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study used publicly available, deidentified data provided by PPMI. According to institutional policies and the PPMI Data Use Agreement, additional institutional review board approval was not required for secondary analyses of this publicly accessible dataset. The original PPMI study was conducted under institutional review board approval at all participating institutions, and all participants provided written informed consent permitting data sharing and secondary analyses. All analyses in this study were conducted on anonymized records with no personally identifiable information, and no participant compensation was applicable.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>ML Baseline Performance</title>
        <p>To establish a deterministic baseline for comparison with LLM-based diagnostic approaches, we evaluated 2 conventional ML classifiers trained on the top 10 SHAP-selected features. LR and an SVM with an RBF kernel were implemented using standard scikit-learn procedures.</p>
        <p>As summarized in <xref ref-type="table" rid="table3">Table 3</xref>, both models achieved identical performance on the test subset of the development set, yielding a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.960 and an accuracy of 0.975. Precision and recall were balanced across PD and HC, and the 95% CIs ranged from approximately 0.94 to 1.00, reflecting the modest sample size of the test subset (n=122).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance of machine learning models on the test subset of the development set (n=122; CIs were estimated via 1000 bootstrap iterations).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="220"/>
            <col width="240"/>
            <col width="240"/>
            <col width="130"/>
            <thead>
              <tr valign="bottom">
                <td>Model</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup> (95% CI)</td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LR_L2<sup>e</sup></td>
                <td>0.960 (0.943-1.000)</td>
                <td>0.953/0.990/0.917</td>
                <td>0.968/0.980/0.957</td>
                <td>0.975</td>
              </tr>
              <tr valign="top">
                <td>SVM_RBF<sup>f</sup></td>
                <td>0.960 (0.944-1.000)</td>
                <td>0.953/0.990/0.917</td>
                <td>0.968/0.980/0.957</td>
                <td>0.975</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>LR_L2: logistic regression (L2 regularization).</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>SVM_RBF: support vector machine (RBF kernel).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p><xref ref-type="table" rid="table4">Table 4</xref> presents the results from the temporal validation set (n=31). LR demonstrated moderately high generalizability, achieving a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.903 and an accuracy of 0.903. In contrast, SVM showed a substantial decline in generalizability, producing a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.484 with markedly reduced recall for PD. The wide CIs observed for both models reflect the limited size of the temporal validation set and the inherent uncertainty associated with evaluating performance on small external samples.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Performance of machine learning models on the temporal validation set (n=31; CIs were estimated via 1000 bootstrap iterations).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="220"/>
            <col width="240"/>
            <col width="240"/>
            <col width="130"/>
            <thead>
              <tr valign="bottom">
                <td>Model</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup> (95% CI)</td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LR_L2<sup>e</sup></td>
                <td>0.903 (0.791-1.000)</td>
                <td>0.917/0.833/1.0</td>
                <td>0.906/1.0/0.813</td>
                <td>0.903</td>
              </tr>
              <tr valign="top">
                <td>SVM_RBF<sup>f</sup></td>
                <td>0.484 (0.157-0.506)</td>
                <td>0.241/0.483/0</td>
                <td>0.5/0/1.0</td>
                <td>0.484</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>LR_L2: logistic regression (L2 regularization).</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>SVM_RBF: support vector machine (RBF kernel).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Given the small size of the temporal validation set (n=31), results obtained on this dataset should be interpreted with caution, as performance estimates may be sensitive to individual misclassifications.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates the prediction patterns for the 2 classifiers across both datasets. On the development test subset (<xref rid="figure2" ref-type="fig">Figure 2</xref>A), both models produced identical predictions, misclassifying 1 HC participant as PD and misclassifying 2 PD participants as HC. In the temporal validation set, LR misclassified only 3 HC cases as PD (<xref rid="figure2" ref-type="fig">Figure 2</xref>B), whereas SVM misclassified nearly all HC cases (<xref rid="figure2" ref-type="fig">Figure 2</xref>C). Although the two models performed similarly on the development test subset, their stability diverged substantially when evaluated on data collected at a later timepoint.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Confusion matrix for machine learning classifiers. (A) Test subset of the development set. (B) Logistic regression of temporal validation set. (C) Support vector machine of temporal validation set. HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Few-Shot Prompting Performance of LLMs</title>
        <p>A total of 4 prompting formats, including PT, MD, ST, and MD+ST, were evaluated under 0-shot to 3-shot conditions to assess the diagnostic performance of LLMs. ST-based prompts were applied exclusively to the LLaMA models because the inclusion of STs substantially affected both classification accuracy and output stability. Non-LLaMA models such as GPT, Gemini, and Claude did not support ST formatting and were therefore evaluated using PT and MD formats.</p>
        <p><xref ref-type="table" rid="table5">Table 5</xref> presents the best-performing results for each model on the test dataset (n=122). The best-performing configuration for each model corresponds to the prompt type and shot number that achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score on the development test set.</p>
        <p>LLaMA 3.3 70B and Gemini 1.5 Pro achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score of 0.987 with an accuracy of 0.992 and stable predictions across 30 repeated trials. LLaMA 3.1 8B reached the same <italic>F</italic><sub>1</sub>-score using the MD+ST prompt at the 3-shot setting but showed one instance of prediction inconsistency. Claude 3.5 Sonnet demonstrated strong performance with an <italic>F</italic><sub>1</sub>-score of 0.972 and an accuracy of 0.984 under 1-shot MD setting, maintaining complete consistency across repeated runs. GPT-4o and GPT-4o-mini achieved slightly lower but stable performance, with macro-averaged <italic>F</italic><sub>1</sub>-scores of 0.961 and 0.910, respectively. Gemini 1.5 Flash produced the lowest <italic>F</italic><sub>1</sub>-score (0.894) despite consistent predictions, reflecting reduced sensitivity for HC cases. Detailed per-participant consistency analyses across 30 trials are provided in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>.</p>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> displays the confusion matrices for the best-performing configuration of each model. Most classification errors occurred in HC samples, indicating high sensitivity for PD across the LLMs.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Best performance of large language models on the test dataset (n=122).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="100"/>
            <col width="70"/>
            <col width="90"/>
            <col width="160"/>
            <col width="160"/>
            <col width="90"/>
            <col width="120"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td>Shot</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup></td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup>, n</td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LLaMA 3.1 8B</td>
                <td>MD+ST<sup>g</sup></td>
                <td>3</td>
                <td>0.987</td>
                <td>0.995/0.990/1</td>
                <td>0.978/1/0.957</td>
                <td>0.992</td>
                <td>1</td>
                <td>PD #73 (13:17)</td>
              </tr>
              <tr valign="top">
                <td>LLaMA 3.3 70B</td>
                <td>ST<sup>h</sup></td>
                <td>2</td>
                <td>0.987</td>
                <td>0.995/0.990/1</td>
                <td>0.978/1/0.957</td>
                <td>0.992</td>
                <td>0</td>
                <td>—<sup>i</sup></td>
              </tr>
              <tr valign="top">
                <td>GPT-4o-mini</td>
                <td>MD<sup>j</sup></td>
                <td>2</td>
                <td>0.910</td>
                <td>0.972/0.943/1</td>
                <td>0.869/1/0.739</td>
                <td>0.951</td>
                <td>1</td>
                <td>HC #103 (19:11)</td>
              </tr>
              <tr valign="top">
                <td>GPT-4o</td>
                <td>PT<sup>k</sup></td>
                <td>0</td>
                <td>0.961</td>
                <td>0.954/0.990/0.917</td>
                <td>0.969/0.980/0.957</td>
                <td>0.975</td>
                <td>2</td>
                <td>HC #26 (28:2); HC #68 (17:13)</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Flash</td>
                <td>PT, MD</td>
                <td>2</td>
                <td>0.894</td>
                <td>0.967/0.934/1</td>
                <td>0.848/1/0.696</td>
                <td>0.938</td>
                <td>0</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>PT, MD</td>
                <td>2</td>
                <td>0.987</td>
                <td>0.995/0.990/1</td>
                <td>0.978/1/0.957</td>
                <td>0.992</td>
                <td>0</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>MD</td>
                <td>1</td>
                <td>0.972</td>
                <td>0.990/0.980/1</td>
                <td>0.957/1/0.913</td>
                <td>0.984</td>
                <td>0</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>Number of participants (out of 122) whose predictions were inconsistent at least once across 30 repeated trials.</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, PD #73) and the number of predicted labels across 30 runs (eg, 13:17 indicates 13 HC and 17 PD predictions).</p>
            </fn>
            <fn id="table5fn7">
              <p><sup>g</sup>MD+ST: markdown with special token.</p>
            </fn>
            <fn id="table5fn8">
              <p><sup>h</sup>ST: special token.</p>
            </fn>
            <fn id="table5fn9">
              <p><sup>i</sup>Not applicable.</p>
            </fn>
            <fn id="table5fn10">
              <p><sup>j</sup>MD: markdown.</p>
            </fn>
            <fn id="table5fn11">
              <p><sup>k</sup>PT: plain text.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Confusion matrices of large language models under the best-performing configurations on the test dataset (n=122). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p><xref ref-type="table" rid="table6">Table 6</xref> and <xref rid="figure4" ref-type="fig">Figure 4</xref> summarize the best-performing configuration of each LLM on the temporal validation set (n=31). LLaMA 3.3 70B achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score of 0.968 using the ST format at the 2-shot setting, with recall of 1.00 and consistent predictions across all participants. LLaMA 3.1 8B, Gemini 1.5 Pro, and Claude 3.5 Sonnet showed comparable performance, each achieving a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.936 with stable predictions. GPT-4o and Gemini 1.5 Flash achieved moderately high performance (<italic>F</italic><sub>1</sub>-score=0.903), with consistent predictions across all participants. In contrast, GPT-4o-mini achieved the lowest <italic>F</italic><sub>1</sub>-score (0.836), driven primarily by reduced recall for HC cases. Across all models, recall for PD remained consistently high (recall≥0.938), demonstrating strong sensitivity even when evaluated on data collected at a later timepoint. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows the confusion matrices for each model under the prompt and shot condition listed in <xref ref-type="table" rid="table6">Table 6</xref>.</p>
        <p>These findings indicate that LLMs generally maintained high diagnostic accuracy and prediction stability under temporally separated data, supporting their potential generalizability beyond the development dataset.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Best performance of large language models on the temporal validation set (n=31).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="100"/>
            <col width="70"/>
            <col width="90"/>
            <col width="160"/>
            <col width="160"/>
            <col width="90"/>
            <col width="120"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td>Shot</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup> (95% CI)</td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup>, n</td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LLaMA 3.1 8B</td>
                <td>MD+ST<sup>g</sup></td>
                <td>3</td>
                <td>0.936 (0.854-1.000)</td>
                <td>0.941/0.882/1</td>
                <td>0.938/1/0.875</td>
                <td>0.935</td>
                <td>2</td>
                <td>HC #13 (9:21); HC #23 (7:23)</td>
              </tr>
              <tr valign="top">
                <td>LLaMA 3.3 70B</td>
                <td>ST<sup>h</sup></td>
                <td>2</td>
                <td>0.968 (0.899-1.000)</td>
                <td>0.969/0.938/1</td>
                <td>0.969/1/0.938</td>
                <td>0.968</td>
                <td>0</td>
                <td>—<sup>i</sup></td>
              </tr>
              <tr valign="top">
                <td>GPT-4o-mini</td>
                <td>MD<sup>j</sup></td>
                <td>2</td>
                <td>0.836 (0.708-0.966)</td>
                <td>0.875/0.750/1</td>
                <td>0.844/1/0.688</td>
                <td>0.839</td>
                <td>2</td>
                <td>HC #10 (4:26); HC #28 (8:22)</td>
              </tr>
              <tr valign="top">
                <td>GPT-4o</td>
                <td>PT<sup>k</sup></td>
                <td>0</td>
                <td>0.903 (0.773-1.000)</td>
                <td>0.917/0.833/1</td>
                <td>0.906/1/0.813</td>
                <td>0.903</td>
                <td>2</td>
                <td>PD #1 (1:29); HC #10 (28:2)</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Flash</td>
                <td>PT</td>
                <td>2</td>
                <td>0.903 (0.773-1.000)</td>
                <td>0.917/0.833/1</td>
                <td>0.906/1/0.813</td>
                <td>0.903</td>
                <td>0</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>PT</td>
                <td>2</td>
                <td>0.936 (0.832-1.000)</td>
                <td>0.941/0.882/1</td>
                <td>0.938/1/0.875</td>
                <td>0.935</td>
                <td>0</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>MD</td>
                <td>1</td>
                <td>0.936 (0.832-1.000)</td>
                <td>0.941/0.882/1</td>
                <td>0.938/1/0.875</td>
                <td>0.935</td>
                <td>0</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>Number of participants (out of 31) whose predictions were inconsistent at least once across 30 repeated trials.</p>
            </fn>
            <fn id="table6fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, HC #13) and the number of predicted labels across 30 runs (eg, 9:21 indicates 9 HC and 21 PD predictions).</p>
            </fn>
            <fn id="table6fn7">
              <p><sup>g</sup>MD+ST: markdown with special token.</p>
            </fn>
            <fn id="table6fn8">
              <p><sup>h</sup>ST: special token.</p>
            </fn>
            <fn id="table6fn9">
              <p><sup>i</sup>Not applicable.</p>
            </fn>
            <fn id="table6fn10">
              <p><sup>j</sup>MD: markdown.</p>
            </fn>
            <fn id="table6fn11">
              <p><sup>k</sup>PT: plain text.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Confusion matrices of large language models under their best-performing configurations on the temporal validation set (n=31). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Diagnostic Performance Under Dual-Output Prompting</title>
        <p>Dual-output prompting was used to evaluate whether requiring LLMs to generate post hoc explanatory text influenced their diagnostic reliability. This experimental setting included 4 representative models, namely LLaMA 3.3 70B, GPT-4o, Gemini 1.5 Pro, and Claude 3.5 Sonnet. Unlike diagnostic-only prompting, this setup instructed each model to output a binary diagnostic label followed by 3 sentences of post hoc explanatory text. Because the LLaMA models are highly sensitive to input formatting, dual-output prompts for LLaMA 3.3 70B were constructed using the ST format. GPT-4o, Gemini 1.5 Pro, and Claude 3.5 Sonnet do not support ST inputs, and therefore PT prompts were used for these models to maintain compatibility.</p>
        <p>As summarized in <xref ref-type="table" rid="table7">Table 7</xref>, all models achieved high diagnostic performance under dual-output prompting, although overall <italic>F</italic><sub>1</sub>-scores were slightly lower than those obtained under few-shot prompting. LLaMA 3.3 70B and Claude 3.5 Sonnet each achieved a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.972 and correctly identified all PD cases. GPT-4o and Gemini 1.5 Pro showed similar results and achieved <italic>F</italic><sub>1</sub>-scores of 0.958 with an accuracy of 0.975. Most prediction errors occurred in HC cases, whereas all PD samples were correctly classified. Instances of inconsistency across 30 repeated trials were rare, typically affecting no more than 2 participants for each model. The full repeated-trial results are presented in <xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>.</p>
        <p><xref rid="figure5" ref-type="fig">Figure 5</xref> displays the confusion matrices corresponding to each model’s best-performing dual-output configuration.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Best performance of large language models on the test dataset under dual-output prompting (n=122).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="100"/>
            <col width="70"/>
            <col width="90"/>
            <col width="160"/>
            <col width="160"/>
            <col width="90"/>
            <col width="120"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td>Shot</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup></td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup>, n</td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LLaMA 3.3 70B</td>
                <td>ST<sup>g</sup></td>
                <td>3</td>
                <td>0.972</td>
                <td>0.990/0.980/1</td>
                <td>0.957/1/0.913</td>
                <td>0.984</td>
                <td>2</td>
                <td>HC #23 (29:1); HC #68 (28:2)</td>
              </tr>
              <tr valign="top">
                <td>GPT-4o</td>
                <td>PT<sup>h</sup></td>
                <td>1</td>
                <td>0.958</td>
                <td>0.985/0.970/1</td>
                <td>0.935/1/0.870</td>
                <td>0.975</td>
                <td>2</td>
                <td>HC #26 (26:4); HC #77 (6:24)</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>PT</td>
                <td>3</td>
                <td>0.958</td>
                <td>0.985/0.970/1</td>
                <td>0.935/1/0.870</td>
                <td>0.975</td>
                <td>1</td>
                <td>HC #97 (28:2)</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>PT</td>
                <td>0</td>
                <td>0.972</td>
                <td>0.990/0.980/1</td>
                <td>0.957/1/0.913</td>
                <td>0.984</td>
                <td>1</td>
                <td>PD #23 (1:29)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table7fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table7fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table7fn5">
              <p><sup>e</sup>Number of participants (out of 122) whose predictions were inconsistent at least once across 30 repeated trials.</p>
            </fn>
            <fn id="table7fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, HC #23) and the number of predicted labels across 30 runs (eg, 29:1 indicates 29 HC and 1 PD predictions).</p>
            </fn>
            <fn id="table7fn7">
              <p><sup>g</sup>ST: special token.</p>
            </fn>
            <fn id="table7fn8">
              <p><sup>h</sup>PT: plain text.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Confusion matrices of large language models under best-performing dual-output prompting conditions on the test subset of the development set (n=122). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Generalizability was further assessed using the temporal validation set (n=31). As shown in <xref ref-type="table" rid="table8">Table 8</xref>, all models maintained strong diagnostic sensitivity, and recall for PD remained equal to 1 across all participants. GPT-4o achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score of 0.968 with an accuracy of 0.968. LLaMA 3.3 70B followed with a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.935. Gemini 1.5 Pro and Claude 3.5 Sonnet showed modest decreases in performance with <italic>F</italic><sub>1</sub>-scores of 0.903 and 0.869, respectively, primarily due to HC misclassifications.</p>
        <p><xref rid="figure6" ref-type="fig">Figure 6</xref> presents the confusion matrices for the temporal validation set under the best-performing configuration for each model.</p>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Best performance of large language models under dual-output prompting on the temporal validation set (n=31).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="100"/>
            <col width="70"/>
            <col width="90"/>
            <col width="160"/>
            <col width="160"/>
            <col width="90"/>
            <col width="120"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td>Shot</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup><break/>(95% CI)</td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup></td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LLaMA 3.3 70B</td>
                <td>ST<sup>g</sup></td>
                <td>3</td>
                <td>0.935 (0.832-1.000)</td>
                <td>0.941/0.882/1</td>
                <td>0.938/1/0.875</td>
                <td>0.935</td>
                <td>2</td>
                <td>HC #11 (23:7); HC #21 (8:22)</td>
              </tr>
              <tr valign="top">
                <td>GPT-4o</td>
                <td>PT<sup>h</sup></td>
                <td>1</td>
                <td>0.968 (0.896-1.000)</td>
                <td>0.969/0.938/1</td>
                <td>0.969/1/0.938</td>
                <td>0.968</td>
                <td>1</td>
                <td>HC #21 (29:1)</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>PT</td>
                <td>3</td>
                <td>0.903 (0.774-1.000)</td>
                <td>0.917/0.833/1</td>
                <td>0.906/1/0.813</td>
                <td>0.903</td>
                <td>0</td>
                <td>—<sup>i</sup></td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>PT</td>
                <td>0</td>
                <td>0.869 (0.735-0.968)</td>
                <td>0.895/0.789/1</td>
                <td>0.875/1/0.750</td>
                <td>0.871</td>
                <td>1</td>
                <td>HC #10 (1:29)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table8fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table8fn5">
              <p><sup>e</sup>Number of participants (out of 31) whose predictions were inconsistent at least once across 30 runs.</p>
            </fn>
            <fn id="table8fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, HC #11) and the number of predicted labels across 30 runs (eg, 23:7 indicates 23 HC and 7 PD predictions).</p>
            </fn>
            <fn id="table8fn7">
              <p><sup>g</sup>ST: special token.</p>
            </fn>
            <fn id="table8fn8">
              <p><sup>h</sup>PT: plain text.</p>
            </fn>
            <fn id="table8fn9">
              <p><sup>i</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Confusion matrices of large language models under best-performing dual-output prompting conditions on the temporal validation set (n=31). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Following the diagnostic classification results presented in <xref ref-type="table" rid="table7">Table 7</xref>, the semantic consistency of the post hoc explanatory texts was evaluated to assess the stability of generated explanations under dual-output prompting. Pairwise cosine similarity was computed among 30 post hoc explanatory texts generated for each participant, and the resulting mean and SD were averaged across all 122 test participants. As shown in <xref ref-type="table" rid="table9">Table 9</xref>, all models maintained high semantic consistency, with mean cosine similarity values exceeding 0.95. LLaMA 3.3 70B achieved the highest value under 0-shot prompting (mean 0.997, SD 0.005), and the variation across models and shot settings was small (≤0.03). This evaluation was limited to the development set, and additional exploratory metrics for the temporal validation set are provided in <xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>.</p>
        <table-wrap position="float" id="table9">
          <label>Table 9</label>
          <caption>
            <p>Semantic consistency of reasoning outputs on the test dataset (n=122). Semantic consistency was assessed based on pairwise cosine similarity among 30 post hoc explanatory texts generated per participant.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="400"/>
            <col width="570"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Model</td>
                <td>Cosine similarity, mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>LLaMA 3.3 70B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0-shot</td>
                <td>0.997 (0.005)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-shot</td>
                <td>0.995 (0.009)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2-shot</td>
                <td>0.966 (0.016)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3-shot</td>
                <td>0.980 (0.011)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>GPT-4o</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0-shot</td>
                <td>0.973 (0.016)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-shot</td>
                <td>0.980 (0.011)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2-shot</td>
                <td>0.981 (0.011)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3-shot</td>
                <td>0.975 (0.016)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Gemini 1.5 Pro</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0-shot</td>
                <td>0.969 (0.019)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-shot</td>
                <td>0.968 (0.025)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2-shot</td>
                <td>0.956 (0.029)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3-shot</td>
                <td>0.951 (0.026)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Claude 3.5 Sonnet</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0-shot</td>
                <td>0.985 (0.017)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-shot</td>
                <td>0.986 (0.016)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2-shot</td>
                <td>0.987 (0.016)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3-shot</td>
                <td>0.979 (0.015)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Fine-Tuned Prompting Performance of LLMs</title>
        <p>Supervised fine-tuning substantially improved the diagnostic performance of lightweight LLMs and resulted in more consistent classification across the evaluated datasets. <xref ref-type="table" rid="table10">Table 10</xref> summarizes the results of GPT-4o-mini and Gemini 1.5 Flash on the development test set (n=122).</p>
        <table-wrap position="float" id="table10">
          <label>Table 10</label>
          <caption>
            <p>Fine-tuning performance of lightweight large language models on the development test set (n=122).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="100"/>
            <col width="100"/>
            <col width="180"/>
            <col width="160"/>
            <col width="100"/>
            <col width="110"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup></td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup></td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-4o-mini</td>
                <td>PT<sup>g</sup></td>
                <td>0.987</td>
                <td>0.995/0.990/1</td>
                <td>0.978/1/0.957</td>
                <td>0.992</td>
                <td>0</td>
                <td>—<sup>h</sup></td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Flash</td>
                <td>PT</td>
                <td>0.973</td>
                <td>0.990/0.980/1</td>
                <td>0.957/1/0.913</td>
                <td>0.984</td>
                <td>1</td>
                <td>HC #23 (9:21)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table10fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table10fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table10fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table10fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table10fn5">
              <p><sup>e</sup>Number of participants (out of 122) whose predictions were inconsistent at least once across 30 repeated trials.</p>
            </fn>
            <fn id="table10fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, HC #23) and the number of predicted labels across 30 runs (eg, 9:21 indicates 9 HC and 21 PD predictions).</p>
            </fn>
            <fn id="table10fn7">
              <p><sup>g</sup>PT: plain text.</p>
            </fn>
            <fn id="table10fn8">
              <p><sup>h</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The fine-tuned GPT-4o-mini achieved the highest macro-averaged <italic>F</italic><sub>1</sub>-score of 0.987, with a recall of 1 for PD and stable predictions across 30 repeated trials. Gemini 1.5 Flash followed closely with an <italic>F</italic><sub>1</sub>-score of 0.973, maintaining an accuracy of 0.984 and showing only 1 inconsistent prediction (participant HC #23). Both models were trained and evaluated using PT prompts. The corresponding confusion matrices of the fine-tuned LLMs on the development test set are presented in <xref rid="figure7" ref-type="fig">Figure 7</xref>; nearly all misclassifications occurred among HC participants, whereas both models correctly identified all PD cases in the sample.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Confusion matrices of lightweight large language models after fine-tuning on the development test set (n=122). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>To further assess model generalizability, both fine-tuned LLMs were evaluated on the temporal validation set (n=31). As summarized in <xref ref-type="table" rid="table11">Table 11</xref>, GPT-4o-mini achieved <italic>F</italic><sub>1</sub>-score, precision, recall, and accuracy of 1.000 on the temporal validation set, correctly classifying all participants in this sample. In contrast, Gemini 1.5 Flash demonstrated slightly lower but still strong performance with an <italic>F</italic><sub>1</sub>-score of 0.903 and an accuracy of 0.903. All PD participants were correctly identified, and the few errors were limited to HC participants. <xref rid="figure8" ref-type="fig">Figure 8</xref> presents the confusion matrices under the same fine-tuned PT prompting configuration. These findings confirm that fine-tuning notably improved classification stability while preserving sensitivity to PD in temporally independent data.</p>
        <table-wrap position="float" id="table11">
          <label>Table 11</label>
          <caption>
            <p>Fine-tuning performance of lightweight large language models on the temporal validation set (n=31).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="90"/>
            <col width="160"/>
            <col width="140"/>
            <col width="130"/>
            <col width="100"/>
            <col width="110"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Prompt</td>
                <td><italic>F</italic><sub>1</sub>-score<sup>a</sup><break/>(95% CI)</td>
                <td>Precision (macro avg<sup>b</sup>/PD<sup>c</sup>/HC<sup>d</sup>)</td>
                <td>Recall (macro avg/PD/HC)</td>
                <td>Accuracy</td>
                <td>Inconsistency<sup>e</sup></td>
                <td>Inconsistent cases<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-4o-mini</td>
                <td>PT<sup>g</sup></td>
                <td>1.000 (1.000-1.000)</td>
                <td>1/1/1</td>
                <td>1/1/1</td>
                <td>1</td>
                <td>0</td>
                <td>—<sup>h</sup></td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Flash</td>
                <td>PT</td>
                <td>0.903 (0.770-1.000)</td>
                <td>0.917/0.833/1</td>
                <td>0.906/1/0.813</td>
                <td>0.903</td>
                <td>1</td>
                <td>HC #21 (25:5)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table11fn1">
              <p><sup>a</sup><italic>F</italic><sub>1</sub>-scores represent macro avg values across PD and HC classes.</p>
            </fn>
            <fn id="table11fn2">
              <p><sup>b</sup>macro avg: macro-averaged.</p>
            </fn>
            <fn id="table11fn3">
              <p><sup>c</sup>PD: Parkinson disease.</p>
            </fn>
            <fn id="table11fn4">
              <p><sup>d</sup>HC: healthy controls.</p>
            </fn>
            <fn id="table11fn5">
              <p><sup>e</sup>Number of participants (out of 31) whose predictions were inconsistent at least once across 30 runs.</p>
            </fn>
            <fn id="table11fn6">
              <p><sup>f</sup>Example of inconsistent participants showing the final label (eg, HC #21) and the number of predicted labels across 30 runs (eg, 25:5 indicates 25 HC and 5 PD predictions).</p>
            </fn>
            <fn id="table11fn7">
              <p><sup>g</sup>PT: plain text.</p>
            </fn>
            <fn id="table11fn8">
              <p><sup>h</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Confusion matrices of lightweight large language models after fine-tuning on the temporal validation set (n=31). HC: healthy controls; PD: Parkinson disease.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>These results confirm that with sufficient training data and consistent prompting formats, even small-scale LLMs can achieve classification accuracy comparable to larger models while maintaining stable performance.</p>
        <p><xref rid="figure9" ref-type="fig">Figure 9</xref> provides an integrated comparison of the best-performing models across all experimental settings, including traditional ML baselines, few-shot prompting with LLM (LLM_F), dual-output prompting with LLM (LLM_D), and fine-tuned prompting with LLM (LLM_FT). Across the development dataset, multiple LLM configurations achieved macro-averaged <italic>F</italic><sub>1</sub>-scores comparable to LR, and fine-tuned lightweight models reached the highest overall performance.</p>
        <fig id="figure9" position="float">
          <label>Figure 9</label>
          <caption>
            <p>Comparison of the best macro-averaged <italic>F</italic><sub>1</sub>-scores from the top-performing models in machine learning (ML) and large language model experiments across few-shot, dual-output, and fine-tuned prompting on the development and temporal validation sets.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e77561_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>On the temporal validation dataset, LR maintained moderate generalizability, whereas SVM showed substantial degradation when applied to temporally separated data. In contrast, several LLM configurations preserved high recall for PD and sustained overall performance, particularly under fine-tuned and few-shot prompting conditions. These results demonstrate that while ML baselines provide deterministic reference points, LLMs exhibit greater flexibility across prompting strategies and maintain stable sensitivity to PD in both datasets.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Overview</title>
        <p>This study examined how modern LLMs process structured clinical variables when these variables are reformatted into natural language prompts for the diagnostic classification of PD. Using SHAP-selected features derived from the PPMI dataset, we compared multiple LLM families and prompting strategies with conventional ML baselines. Three main findings emerged. First, several LLMs achieved diagnostic performance comparable to LR while maintaining high sensitivity for PD across both the development test set and the temporal validation set. Second, the diagnostic behavior of LLMs varied depending on prompt format, model family, and shot configuration, whereas the ML baselines produced deterministic and highly stable predictions. Third, supervised fine-tuning markedly improved both accuracy and output stability in lightweight LLMs, allowing a compact model such as GPT-4o-mini to correctly classify all participants in the temporal validation set. In addition, because the ML baselines in this study were minimally tuned, part of any performance gap between ML models and LLMs may reflect limited optimization of the ML baselines rather than true methodological differences. Accordingly, the present comparisons should be interpreted as exploratory rather than definitive.</p>
        <p>The performance of the LLaMA family was strongly influenced by input formatting. In particular, the inclusion or removal of STs resulted in notable differences in accuracy and sensitivity, although their classification results varied more widely across different shot settings [<xref ref-type="bibr" rid="ref29">29</xref>]. Dual-output prompting, which required models to generate diagnostic labels along with post hoc explanatory text, resulted in slightly lower <italic>F</italic><sub>1</sub>-scores compared with diagnostic-only prompting but did not substantially destabilize predictions. The generated text exhibited high semantic consistency across repeated trials. These explanations should be regarded as post hoc natural language outputs rather than indicators of true model interpretability, since they are produced after the model’s primary diagnostic prediction step [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>To evaluate whether inconsistent predictions reflected clinically ambiguous participants rather than model-level variability, we conducted a qualitative review of cases that exhibited the highest numbers of label inconsistencies. A total of 2 HC participants and 2 PD participants were selected for detailed examination. All 10 SHAP-selected variables, including Unified Parkinson Disease Rating Scale (UPDRS) motor scores, University of Pennsylvania Smell Identification Test (UPSIT) percentiles, and dopamine transporter single-photon emission computed tomography putaminal uptake metrics, were compared with the overall distributions of the PD and HC groups. None of the reviewed cases demonstrated borderline or contradictory clinical profiles. The HC cases showed preserved dopaminergic uptake and normal motor assessments, with only mild olfactory reductions typical of healthy older adults. The PD cases exhibited reduced dopaminergic activity, clear asymmetry, and motor impairment consistent with established PD patterns. These observations suggest that label inconsistencies are unlikely to arise from underlying clinical ambiguity. Instead, they appear to reflect model stochasticity and prompt-dependent variability [<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        <p>Supervised fine-tuning clarified the role of training data in stabilizing LLM predictions. When provided with labeled examples, both GPT-4o-mini and Gemini 1.5 Flash demonstrated substantial improvements in diagnostic accuracy and showed consistently high sensitivity to PD on the temporal validation set. GPT-4o-mini classified all 31 participants correctly after fine-tuning. This result suggests that compact models can approximate or exceed the performance of larger models when trained on appropriately structured datasets. It also indicates that fine-tuning can reduce susceptibility to prompt-level variability and may support more reliable behavior in clinical decision-support environments. However, this comparison should be interpreted with caution. Although architectural differences may also contribute to the observed performance gap, GPT-4o-mini was fine-tuned on more than twice as many labeled samples as Gemini 1.5 Flash (1052 vs 500) due to platform constraints. Part of GPT-4o-mini’s superior performance may therefore reflect the larger amount of training data rather than inherent model advantages, which limits how directly the two models can be compared.</p>
        <p>Overall, this study illustrates both the potential and the limitations of modern LLMs for processing structured clinical variables that are presented in natural language form. While several models achieved strong diagnostic performance and generalized well to temporally separated data, their outputs remained sensitive to prompt structures, model architectures, and few-shot configurations. Occasional inconsistencies across repeated runs further highlight the stochastic nature of LLM output generation [<xref ref-type="bibr" rid="ref33">33</xref>]. These characteristics reinforce the importance of careful interpretation and the need for rigorous evaluation frameworks before LLMs can be integrated safely into real-world diagnostic workflows.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Several limitations should be considered when interpreting these findings. First, although temporal validation provided an important assessment of model generalizability, the temporal validation set was relatively small (n=31), which resulted in wide CIs for the reported performance metrics. In addition, models were trained on datasets that included imputed values, whereas evaluation was conducted on datasets restricted to complete cases without missing data, including both the development test set and the temporal validation set. This mismatch may introduce a distributional shift and result in performance estimates that reflect a best-case evaluation scenario rather than real-world clinical conditions where missing data are common.</p>
        <p>Second, the 10 features used for model input were selected using SHAP values from tree-based models. Feature sets obtained using alternative selection strategies may differ, so the current feature subset may not fully represent model-agnostic feature selection. Furthermore, the explanatory text generated by LLMs under the dual-output prompting framework was not reviewed by clinical experts. Accordingly, the semantic consistency metric reflects internal textual stability rather than clinically accurate or factually grounded post hoc explanatory text, and the clinical validity of the generated explanations remains unverified.</p>
        <p>Finally, several methodological constraints limit direct model-to-model comparisons. Prompt format and shot configuration were selected based on performance observed on the development test dataset, rather than using a separate validation set for configuration selection. This design choice reflects the study’s aim to broadly compare model behaviors rather than to establish definitive optimal configurations. In addition, platform-specific constraints limited the extent of fine-tuning that could be performed across LLMs. While GPT-4o-mini was fine-tuned using 1052 training samples with an additional held-out validation set of 186 samples, the Gemini 1.5 Flash fine-tuning interface restricts supervised training to a maximum of 500 samples, and the ML baselines were trained on 990 samples. As a result, the observed performance differences across models may reflect differences in training data availability rather than inherent architectural superiority and should be interpreted as exploratory.</p>
        <p>Prompt structuring flexibility also differed across platforms, as the ST format was applied only to the LLaMA models due to platform-specific input constraints, further limiting the degree to which direct model-to-model comparisons can be made in this study.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>Future research should expand these results in several directions. Larger temporal or external datasets, including real-world clinical settings, are needed to strengthen generalizability assessments. Additional work is warranted to examine optimization strategies for prompt design, temperature settings, and calibration methods that may reduce stochastic variability. Expert-based evaluation of generated post hoc explanatory text may clarify how these outputs can be used to support clinical decision-making. Further exploration of supervised fine-tuning for additional lightweight LLMs could help identify resource-efficient models suitable for deployment in constrained clinical environments. Finally, integrating imaging, sensor-derived digital biomarkers, and longitudinal clinical trajectories may clarify how LLMs can combine multimodal biomedical data for diagnostic tasks.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study provides an exploratory benchmark of how LLMs process structured clinical variables when presented in natural language form. Multiple LLMs achieved diagnostic performance comparable to conventional ML baselines and maintained high sensitivity for PD under temporal validation. However, their predictions were influenced by prompt format, shot configuration, and model architecture, and occasional inconsistencies reflected inherent stochasticity rather than clinical ambiguity. Supervised fine-tuning substantially improved reliability in lightweight models, demonstrating that compact architectures can achieve stable and high-performing classification when trained on sufficient labeled examples. These findings highlight both the opportunities and the challenges associated with applying LLMs to structured clinical data and emphasize the need for rigorous evaluation before clinical implementation.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Performance and weights of tree-based models.</p>
        <media xlink:href="medinform_v14i1e77561_app1.doc" xlink:title="DOC File , 59 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Weighted mean SHAP values for the top 10 selected variables.</p>
        <media xlink:href="medinform_v14i1e77561_app2.doc" xlink:title="DOC File , 59 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Prompt formats by structure type and output design.</p>
        <media xlink:href="medinform_v14i1e77561_app3.doc" xlink:title="DOC File , 51 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Hyperparameter configurations for logistic regression and support vector machine classifiers.</p>
        <media xlink:href="medinform_v14i1e77561_app4.doc" xlink:title="DOC File , 58 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Description of model families and API.</p>
        <media xlink:href="medinform_v14i1e77561_app5.doc" xlink:title="DOC File , 63 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Experimental timeline of model evaluations.</p>
        <media xlink:href="medinform_v14i1e77561_app6.doc" xlink:title="DOC File , 60 KB"/>
      </supplementary-material>
      <supplementary-material id="app7">
        <label>Multimedia Appendix 7</label>
        <p>Hardware and software configuration.</p>
        <media xlink:href="medinform_v14i1e77561_app7.doc" xlink:title="DOC File , 57 KB"/>
      </supplementary-material>
      <supplementary-material id="app8">
        <label>Multimedia Appendix 8</label>
        <p>Summary of fine-tuning configurations for lightweight language models.</p>
        <media xlink:href="medinform_v14i1e77561_app8.doc" xlink:title="DOC File , 60 KB"/>
      </supplementary-material>
      <supplementary-material id="app9">
        <label>Multimedia Appendix 9</label>
        <p>Diagnostic performance of large language models in Parkinson disease classification using prompt engineering and few-shot learning.</p>
        <media xlink:href="medinform_v14i1e77561_app9.doc" xlink:title="DOC File , 169 KB"/>
      </supplementary-material>
      <supplementary-material id="app10">
        <label>Multimedia Appendix 10</label>
        <p>Diagnostic generalization of large language models under dual-output prompting on the test dataset.</p>
        <media xlink:href="medinform_v14i1e77561_app10.doc" xlink:title="DOC File , 85 KB"/>
      </supplementary-material>
      <supplementary-material id="app11">
        <label>Multimedia Appendix 11</label>
        <p>Semantic consistency of reasoning outputs on the temporal validation set under dual-output prompting.</p>
        <media xlink:href="medinform_v14i1e77561_app11.doc" xlink:title="DOC File , 33 KB"/>
      </supplementary-material>
      <supplementary-material id="app12">
        <label>Multimedia Appendix 12</label>
        <p>Complete code and dataset package used for all experiments, including preprocessing, few-shot prompting, reasoning consistency analysis, and supervised fine-tuning.</p>
        <media xlink:href="medinform_v14i1e77561_app12.zip" xlink:title="ZIP File  (Zip Archive), 306 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">HC</term>
          <def>
            <p>healthy controls</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLM_D</term>
          <def>
            <p>dual-output prompting with LLM</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM_F</term>
          <def>
            <p>few-shot prompting with LLM</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LLM_FT</term>
          <def>
            <p>fine-tuned prompting with LLM</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MD</term>
          <def>
            <p>markdown</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MD+ST</term>
          <def>
            <p>markdown with special token</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PD</term>
          <def>
            <p>Parkinson disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">PPMI</term>
          <def>
            <p>Parkinson’s Progression Markers Initiative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PT</term>
          <def>
            <p>plain text</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">RBF</term>
          <def>
            <p>radial basis function</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SHAP</term>
          <def>
            <p>Shapley additive explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">ST</term>
          <def>
            <p>special token</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">UPDRS</term>
          <def>
            <p>Unified Parkinson Disease Rating Scale</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">UPSIT</term>
          <def>
            <p>University of Pennsylvania Smell Identification Test</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank Editage for English language editing. We also acknowledge the use of generative artificial intelligence (AI) tools (ChatGPT by OpenAI and Grok by xAI) under author supervision for language editing and phrasing refinement during manuscript preparation. All analyses, experimental designs, and interpretations related to LLM diagnostics were fully conceived, conducted, and verified solely by the authors.</p>
    </ack>
    <notes>
      <sec>
        <title>Funding</title>
        <p>This work was supported by 2 funding sources. First, the Regional Innovation System &#38; Education (RISE) program through the Institute of Regional Innovation System &#38; Education in Busan Metropolitan City, funded by the Ministry of Education (MOE) and the Busan Metropolitan City, Republic of Korea (2025-RISE-02-003). Second, the Global-Learning &#38; Academic research institution for Master’s·PhD students, and Postdocs (G-LAMP) Program of the National Research Foundation of Korea (NRF), grant funded by the Ministry of Education (RS-2025-2540216). The funders had no role in the study design, data collection, analysis, interpretation of results, manuscript writing, or the decision to submit the work for publication.</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets analyzed during this study are available from the Parkinson’s Progression Markers Initiative (PPMI) repository, subject to PPMI data access policies. Selected code and processed data supporting the findings of this study are included in <xref ref-type="supplementary-material" rid="app12">Multimedia Appendix 12</xref>. Additional materials can be shared upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>HJS was responsible for the conceptualization, methodology, data curation, formal analysis, investigation, visualization, and writing–original draft. DYK, as the corresponding author, provided overall supervision, project administration, funding acquisition, and contributed to conceptualization and writing–review and editing. YJJ and SMJ contributed to validation and provided clinical insights and feedback during manuscript reviews. All authors read and approved the final version of the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shakya</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Prevett</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Characterization of Parkinson's disease subtypes and related attributes</article-title>
          <source>Front Neurol</source>
          <year>2022</year>
          <volume>13</volume>
          <fpage>810038</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35677337"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fneur.2022.810038</pub-id>
          <pub-id pub-id-type="medline">35677337</pub-id>
          <pub-id pub-id-type="pmcid">PMC9167933</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tolosa</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Garrido</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Scholz</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Poewe</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Challenges in the diagnosis of Parkinson's disease</article-title>
          <source>Lancet Neurol</source>
          <year>2021</year>
          <volume>20</volume>
          <issue>5</issue>
          <fpage>385</fpage>
          <lpage>397</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33894193"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S1474-4422(21)00030-2</pub-id>
          <pub-id pub-id-type="medline">33894193</pub-id>
          <pub-id pub-id-type="pii">S1474-4422(21)00030-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC8185633</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Virameteekul</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Revesz</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jaunmuktane</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Warner</surname>
              <given-names>TT</given-names>
            </name>
            <name name-style="western">
              <surname>De Pablo-Fernández</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Clinical diagnostic accuracy of Parkinson's Disease: Where do we stand?</article-title>
          <source>Mov Disord</source>
          <year>2023</year>
          <volume>38</volume>
          <issue>4</issue>
          <fpage>558</fpage>
          <lpage>566</lpage>
          <pub-id pub-id-type="doi">10.1002/mds.29317</pub-id>
          <pub-id pub-id-type="medline">36602274</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jankovic</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Parkinson's disease: Clinical features and diagnosis</article-title>
          <source>J Neurol Neurosurg Psychiatry</source>
          <year>2008</year>
          <volume>79</volume>
          <issue>4</issue>
          <fpage>368</fpage>
          <lpage>376</lpage>
          <pub-id pub-id-type="doi">10.1136/jnnp.2007.131045</pub-id>
          <pub-id pub-id-type="medline">18344392</pub-id>
          <pub-id pub-id-type="pii">79/4/368</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Statistics</article-title>
          <source>Parkinson’s Foundation</source>
          <access-date>2025-02-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.parkinson.org/understanding-parkinsons/statistics">https://www.parkinson.org/understanding-parkinsons/statistics</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Parkinson Progression Marker Initiative</collab>
          </person-group>
          <article-title>The Parkinson Progression Marker Initiative (PPMI)</article-title>
          <source>Prog Neurobiol</source>
          <year>2011</year>
          <volume>95</volume>
          <issue>4</issue>
          <fpage>629</fpage>
          <lpage>635</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21930184"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.pneurobio.2011.09.005</pub-id>
          <pub-id pub-id-type="medline">21930184</pub-id>
          <pub-id pub-id-type="pii">S0301-0082(11)00165-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC9014725</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hosmer</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lemeshow</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sturdivant</surname>
              <given-names>RX</given-names>
            </name>
          </person-group>
          <source>Applied Logistic Regression. 3rd Edition</source>
          <year>2013</year>
          <publisher-loc>Hoboken</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cortes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Vapnik</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Support-vector networks</article-title>
          <source>Mach Learn</source>
          <year>1995</year>
          <volume>20</volume>
          <issue>3</issue>
          <fpage>273</fpage>
          <lpage>297</lpage>
          <pub-id pub-id-type="doi">10.1007/bf00994018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gerraty</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Provost</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Haas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lancashire</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Machine learning within the Parkinson's progression markers initiative: review of the current state of affairs</article-title>
          <source>Front Aging Neurosci</source>
          <year>2023</year>
          <volume>15</volume>
          <fpage>1076657</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36861121"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fnagi.2023.1076657</pub-id>
          <pub-id pub-id-type="medline">36861121</pub-id>
          <pub-id pub-id-type="pmcid">PMC9968811</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alex</surname>
              <given-names>JSR</given-names>
            </name>
          </person-group>
          <article-title>Early detection of Parkinson’s disease using motor symptoms and machine learning</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 18, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2304.09245"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pradeep</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kamalakannan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Comprehensive review of literature on Parkinson's disease diagnosis</article-title>
          <source>Comput Biol Chem</source>
          <year>2024</year>
          <volume>113</volume>
          <fpage>108228</fpage>
          <pub-id pub-id-type="doi">10.1016/j.compbiolchem.2024.108228</pub-id>
          <pub-id pub-id-type="medline">39413446</pub-id>
          <pub-id pub-id-type="pii">S1476-9271(24)00216-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>1940</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nazi</surname>
              <given-names>ZA</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Large language models in healthcare and medical domain: a review</article-title>
          <source>Informatics</source>
          <year>2024</year>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>57</fpage>
          <pub-id pub-id-type="doi">10.3390/informatics11030057</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wiegreffe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marasović</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Teach me to explain: a review of datasets for explainable natural language processing</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 24, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2102.12060"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan Sim</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Interactive natural language processing</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 22, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2305.13246"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sui</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Table meets LLM: can large language models understand structured table data?</article-title>
          <source>Proceedings of the 17th ACM International Conference on Web Search and Data Mining (WSDM ’24)</source>
          <year>2024</year>
          <fpage>645</fpage>
          <lpage>654</lpage>
          <pub-id pub-id-type="doi">10.1145/3616855.3635752</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afkanpour</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hosseinzadeh</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tabesh</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Identify the most appropriate imputation method for handling missing values in clinical structured datasets: a systematic review</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>188</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-024-02310-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-024-02310-6</pub-id>
          <pub-id pub-id-type="medline">39198744</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-024-02310-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC11351057</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Hond</surname>
              <given-names>AAH</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>VB</given-names>
            </name>
            <name name-style="western">
              <surname>Kant</surname>
              <given-names>IMJ</given-names>
            </name>
            <name name-style="western">
              <surname>Van Calster</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez-Boussard</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Perspectives on validation of clinical predictive algorithms</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00832-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00832-9</pub-id>
          <pub-id pub-id-type="medline">37149704</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00832-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC10163568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Harrell</surname>
              <given-names>FE</given-names>
            </name>
          </person-group>
          <article-title>Prediction models need appropriate internal, internal-external, and external validation</article-title>
          <source>J Clin Epidemiol</source>
          <year>2016</year>
          <volume>69</volume>
          <fpage>245</fpage>
          <lpage>247</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25981519"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2015.04.005</pub-id>
          <pub-id pub-id-type="medline">25981519</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(15)00175-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC5578404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Aloe</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>Extracting the variance inflation factor and other multicollinearity diagnostics from typical regression results</article-title>
          <source>Basic and Applied Social Psychology</source>
          <year>2017</year>
          <volume>39</volume>
          <issue>2</issue>
          <fpage>81</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1080/01973533.2016.1277529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nachtsheim</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kutner</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <source>Applied Linear Regression Models. 4th Edition</source>
          <year>2004</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>McGraw-Hill/Irwin</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O’Brien</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>A caution regarding rules of thumb for variance inflation factors</article-title>
          <source>Qual Quant</source>
          <year>2007</year>
          <volume>41</volume>
          <issue>5</issue>
          <fpage>673</fpage>
          <lpage>690</lpage>
          <pub-id pub-id-type="doi">10.1007/s11135-006-9018-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Random forests</article-title>
          <source>Mach Learn</source>
          <year>2001</year>
          <volume>45</volume>
          <issue>1</issue>
          <fpage>5</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1023/A:1010933404324"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Xgboost: a scalable tree boosting system</article-title>
          <source>22nd ACM SIGKDD International Conference</source>
          <year>2016</year>
          <conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD ’16)</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco</conf-loc>
          <fpage>785</fpage>
          <lpage>794</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Finley</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Guyon</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Luxburg</surname>
              <given-names>UV</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fergus</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vishwanathan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Garnett</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>LightGBM: a highly efficient gradient boosting decision tree</article-title>
          <source>Advances in Neural Information Processing Systems 30 (NeurIPS 2017)</source>
          <year>2017</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Curran Associates, Inc</publisher-name>
          <fpage>3146</fpage>
          <lpage>3154</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prokhorenkova</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gusev</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vorobev</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dorogush</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gulin</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>CatBoost: unbiased boosting with categorical features</article-title>
          <source>Advances in Neural Information Processing Systems 31 (NeurIPS 2018)</source>
          <year>2018</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Curran Associates, Inc</publisher-name>
          <fpage>6638</fpage>
          <lpage>6648</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shwartz-Ziv</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Armon</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Tabular data: deep learning is not all you need</article-title>
          <source>Inf Fusion</source>
          <year>2022</year>
          <volume>81</volume>
          <fpage>84</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1016/j.inffus.2021.11.011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SI</given-names>
            </name>
          </person-group>
          <article-title>A unified approach to interpreting model predictions</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2017</year>
          <access-date>2025-06-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper_files/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf">https://papers.nips.cc/paper_files/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rungta</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Koleczek</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sekhon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Does prompt formatting have any impact on LLM performance?</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 15, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2411.10541</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Creswell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shanahan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Higgins</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Selection-inference: exploiting large language models for interpretable logical reasoning</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 19, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2205.09712"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saito</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rehmsmeier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>e0118432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0118432"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0118432</pub-id>
          <pub-id pub-id-type="medline">25738806</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-26790</pub-id>
          <pub-id pub-id-type="pmcid">PMC4349800</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carpenter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bithell</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Bootstrap confidence intervals: when, which, what? A practical guide for medical statisticians</article-title>
          <source>Statist. Med</source>
          <year>2000</year>
          <volume>19</volume>
          <issue>9</issue>
          <fpage>1141</fpage>
          <lpage>1164</lpage>
          <pub-id pub-id-type="doi">10.1002/(sici)1097-0258(20000515)19:9&#60;1141::aid-sim479&#62;3.0.co;2-f</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The good, the bad, and the greedy: evaluation of LLMs should not ignore non-determinism</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on July 15, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2407.10457"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2025.naacl-long.211</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kojima</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Reid</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Large language models are zero-shot reasoners</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 24, 2022</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2205.11916</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reimers</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gurevych</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Sentence-BERT: sentence embeddings using Siamese BERT-networks</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on August 27, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1908.10084"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T-Y</given-names>
            </name>
          </person-group>
          <article-title>MPNet: masked and permuted pre-training for language understanding</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 20, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2004.09297"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
