<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e94063</article-id><article-id pub-id-type="doi">10.2196/94063</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Machine Learning Approach to Voice-Based Parkinson Disease Screening Using Multiview Spectrogram and Speech Recognition Features: Diagnostic Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zahir</surname><given-names>Arifa</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yu</surname><given-names>Jaehong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jun</surname><given-names>Jin-Sun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name 
name-style="western"><surname>Park</surname><given-names>Kiwon</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Kim</surname><given-names>Ryul</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Jeong</surname><given-names>Hyundoo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical and Robotics Engineering, Incheon National University</institution><addr-line>119 Academy-ro, Yeonsu-gu</addr-line><addr-line>Incheon</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Industrial and Management Engineering, Incheon National University</institution><addr-line>Incheon</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>Department of Neurology, Kangnam Sacred Heart Hospital, Hallym University College of Medicine</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Neurology, Seoul Metropolitan Government-Seoul National University Boramae Medical Center, Seoul National University College of Medicine</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Focsa</surname><given-names>Mircea</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jiang</surname><given-names>Shan</given-names></name></contrib><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Ren</surname><given-names>Shaogang</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hyundoo Jeong, PhD, Department of Biomedical and Robotics Engineering, Incheon National University, 119 Academy-ro, Yeonsu-gu, Incheon, 22012, Republic of Korea, 82 32-835-8677; <email>hdj@inu.ac.kr</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>11</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e94063</elocation-id><history><date date-type="received"><day>24</day><month>02</month><year>2026</year></date><date date-type="rev-recd"><day>24</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>10</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Arifa Zahir, Jaehong Yu, Jin-Sun Jun, Kiwon Park, Ryul Kim, Hyundoo Jeong. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 11.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e94063"/><abstract><sec><title>Background</title><p>Parkinson disease frequently manifests early vocal impairment, motivating the development of noninvasive and scalable digital screening tools.</p></sec><sec><title>Objective</title><p>This study proposes a multiview spectrogram-based deep learning framework integrating recognition-aware context for Parkinson disease detection from voice recordings.</p></sec><sec sec-type="methods"><title>Methods</title><p>Voice recordings from 203 participants (121 with Parkinson disease and 82 healthy controls) were collected prospectively. Three spectrogram representations (Mel, short-time Fourier transform, and constant-Q transform) were extracted and processed through parallel convolutional neural network branches. A recognition ratio (RR) feature vector derived from automatic speech recognition transcript agreement was optionally fused with spectrogram embeddings. Models were evaluated using strict subject-wise 5-fold cross-validation.</p></sec><sec sec-type="results"><title>Results</title><p>The multiview spectrogram recognition-aware Parkinson detection network achieved a mean test accuracy of 86.9% (SD 25.2%) using 3-view spectrogram fusion, improving to 97.4% (SD 5.7%) when incorporating the RR feature. RR integration reduced the false negative rate by approximately 84.5%, substantially improving sensitivity in screening-oriented settings.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Combining multiview spectrogram learning with recognition-aware context significantly enhances voice-based Parkinson disease classification under leakage-free evaluation. 
These findings support the potential of this approach for noninvasive screening in structured recording settings, while further validation in diverse real-world environments is needed.</p></sec></abstract><kwd-group><kwd>Parkinson disease</kwd><kwd>voice-based screening</kwd><kwd>multiview spectrogram</kwd><kwd>deep learning</kwd><kwd>multiview learning</kwd><kwd>automatic speech recognition</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Parkinson disease is a neurological condition that progresses over time and manifests with various motor and nonmotor symptoms [<xref ref-type="bibr" rid="ref1">1</xref>]. There is an increasing need for accurate health informatics systems to support its identification because early detection can improve clinical outcomes and enable timely intervention. Automated screening tools can also reduce the workload for clinicians and support large-scale monitoring [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Among accessible sensing modalities, vocal impairment is one of the most prevalent early symptoms, and voice-based assessment has become an important direction for Parkinson disease identification research [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Speech impairments are highly prevalent in Parkinson disease and may involve both speech production and language-related difficulties [<xref ref-type="bibr" rid="ref7">7</xref>]. Clinically, these impairments encompass reduced vocal loudness (hypophonia), imprecise articulation, a monotone or breathy voice quality, and festinating speech, all of which reflect the combined effects of motor rigidity, bradykinesia, and reduced respiratory drive on the phonatory system [<xref ref-type="bibr" rid="ref8">8</xref>]. 
Increasing evidence suggests that speech and language abnormalities can emerge prior to prominent motor signs and formal diagnosis [<xref ref-type="bibr" rid="ref9">9</xref>]. Consequently, speech and language pathology has been recognized in clinical guidelines as a crucial component of Parkinson disease care from the early stage of diagnosis [<xref ref-type="bibr" rid="ref8">8</xref>]. Recent research highlights the potential of objective acoustic markers to identify Parkinson disease in early or prodromal stages, creating a therapeutic window for early intervention [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Speech and language characteristics can also serve as surrogate markers for tracking disease progression [<xref ref-type="bibr" rid="ref12">12</xref>], and distinct patterns of impairment have been linked to Parkinson disease subtypes and related movement disorders, supporting differential diagnosis using voice biomarkers [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Earlier studies in Parkinson disease voice analysis primarily relied on handcrafted acoustic features such as jitter, shimmer, harmonics-to-noise ratio, and Mel-frequency cepstral coefficients, combined with supervised classifiers such as support vector machines, random forests, or gradient-boosted trees [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. More recent studies extend this paradigm through improved feature selection, interpretable machine learning, and cross-corpus evaluation [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. Despite these advances, many approaches still depend on manually engineered descriptors and may underuse the richer structure present in the speech signal. 
To better capture Parkinson disease&#x2013;related phonatory and articulatory cues, many groups transform audio into time-frequency representations and apply convolutional neural networks (CNNs) or transformer models directly to spectrograms [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Spectrogram features combined with artificial intelligence models have achieved strong performance for early diagnosis, supporting the clinical viability of spectrogram-driven approaches [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>Despite these advances, most models still operate on a single spectrogram view. This is a limitation because different time-frequency representations emphasize complementary aspects of the signal: the short-time Fourier transform (STFT) provides a linear frequency axis with uniform resolution, Mel spectrograms approximate human auditory spacing and emphasize low-to-mid frequencies, and the constant-Q transform (CQT) yields a logarithmic frequency grid that can better represent harmonic and pitch-related patterns relevant to the vocal tremor and dysphonia observed in Parkinson disease. Prior studies in audio and biomedical sound classification show that fusing multiple spectrogram representations can yield more discriminative embeddings than any single representation alone [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. For Parkinson disease speech, however, multispectrogram fusion remains underexplored [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>A second challenge is data scarcity and overfitting. Even recent cohorts often include only a few dozen to a few hundred participants, and many studies rely on highly reused benchmark datasets. 
Previous studies have emphasized concerns, such as participant overlap between training and evaluation sets and optimistic performance estimates, that hinder clinical translation [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. To address these issues, our study emphasizes strict participant-wise separation, controlled preprocessing, consistent model comparisons, and ablation-based analysis.</p><p>Finally, global context&#x2013;based and recognition-based voice features are rarely integrated into voice-based Parkinson disease classification models. Digital biomarker research increasingly highlights the value of multimodal and metadata-informed fusion for stabilizing predictions [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref31">31</xref>]. For voice, a compact recognition-based feature vector can be derived from the same speech recording, referred to here as the recognition ratio (RR). Intuitively, the RR measures how accurately an automatic speech recognition system can transcribe what a participant said: a high RR indicates clear, intelligible speech, whereas a low value reflects speech that is difficult for the system to parse&#x2014;consistent with the articulatory and phonation difficulties clinically observed in Parkinson disease [<xref ref-type="bibr" rid="ref7">7</xref>]. People with Parkinson disease may experience difficulty in clearly pronouncing sentences, which can reduce speech intelligibility and articulation clarity. As a result, the RR can serve as a global, recording-level indicator of intelligibility, providing complementary context to the local spectro-temporal patterns learned from spectrograms. Such recognition-aware integration strategies remain uncommon in voice-based Parkinson disease classification models. From a clinical perspective, these speech abnormalities are directly reflected in acoustic representations of voice. 
Time-frequency spectrograms can capture changes in vocal intensity, pitch stability, and articulation patterns associated with Parkinson disease speech impairment, while the RR provides a complementary measure of speech intelligibility at the utterance level [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Therefore, combining multiview spectrogram features with recognition-aware information allows the model to capture both local acoustic patterns and global intelligibility deficits that are clinically relevant in Parkinson disease.</p><p>In this study, we propose a multiview spectrogram-based deep architecture for noninvasive Parkinson disease screening from voice. From each recording, we derive 3 normalized spectrogram types (Mel, STFT, and CQT) and feed them into parallel CNN branches whose outputs are concatenated. We further introduce a low-dimensional RR vector computed from the same audio but outside the image domain and concatenate it with the spectrogram-based representation to provide global context. Model evaluation is performed under participant-wise cross-validation with strict separation of speakers between folds.</p><p>The contributions of this study are 3-fold:</p><list list-type="order"><list-item><p>A multibranch CNN architecture is introduced that exploits Mel, STFT, and CQT spectrograms through parallel branches, and multiview feature concatenation is shown to improve performance compared with single-view models and recent spectrogram-based baselines.</p></list-item><list-item><p>The effect of recognition-based features is evaluated by integrating a low-dimensional RR vector as lightweight global context. 
Performance gains are quantified using accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, and area under the receiver operating characteristic curve, demonstrating that contextual information can stabilize Parkinson disease voice classification.</p></list-item><list-item><p>A unified benchmark comparison is presented between classical acoustic feature machine learning baselines and spectrogram-based CNNs using a consistent participant-wise protocol. Comparison with recent state-of-the-art deep learning methods is also provided to contextualize the approach within the broader literature.</p></list-item></list></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>A total of 203 participants were enrolled, including 121 individuals diagnosed with Parkinson disease and 82 healthy controls. The Parkinson disease group comprised 53 female participants and 68 male participants, while the healthy control group comprised 50 female participants and 32 male participants. The mean age was 68.7 (SD 8.9) years in the Parkinson disease group and 65.3 (SD 9.8) years in the healthy control group. Disease severity in the Parkinson disease group was assessed using the MDS-UPDRS-III (Movement Disorder Society&#x2013;Sponsored Revision of the Unified Parkinson Disease Rating Scale) score (range 3&#x2010;55; mean 26.7, SD 10.6, and median 25.5, IQR 19.0-34.0). Of the 203 enrolled participants, 192 (112 with Parkinson disease and 80 healthy controls) had complete and usable audio recordings across all required speech tasks and were included in model training and evaluation. Eleven participants (9 with Parkinson disease and 2 healthy controls) were excluded due to missing recordings or data quality issues identified in the source dataset. 
<xref ref-type="table" rid="table1">Table 1</xref> summarizes participant characteristics.</p><p>Recordings were collected in a hospital inspection room using a Samsung Galaxy Tab S7 FE positioned approximately 30 cm from the participant&#x2019;s mouth. Audio was recorded in MP3 format at 48 kHz (32-bit) and converted to WAV format prior to preprocessing. The speech protocol included 2 tasks: sustained vowel phonation of /a/ (Task 1) and reading 20 sentences comprising 10 nonmeaningful and 10 meaningful sentences (Task 2). The 20 sentence-reading items are indexed as test cases 1&#x2010;20; test cases 1&#x2010;10 correspond to nonmeaningful utterances, and test cases 11&#x2010;20 correspond to meaningful sentences.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of study population characteristics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristic</td><td align="left" valign="bottom">PD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> group (n=121)</td><td align="left" valign="bottom">HC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> group (n=82)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Sex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female, n</td><td align="left" valign="top">53</td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male, n</td><td align="left" valign="top">68</td><td align="left" valign="top">32</td></tr><tr><td align="left" valign="top">Age (y), mean (SD)</td><td align="left" valign="top">68.7 (8.9)</td><td align="left" valign="top">65.3 (9.8)</td></tr><tr><td align="left" valign="top" colspan="3">MDS-UPDRS-III<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> score</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Range</td><td align="left" valign="top">3&#x2013;55</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean (SD)</td><td align="left" valign="top">26.7 (10.6)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median (IQR)</td><td align="left" valign="top">25.5 (19.0-34.0)</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>PD: Parkinson disease.</p></fn><fn id="table1fn2"><p><sup>b</sup>HC: healthy control.</p></fn><fn id="table1fn3"><p><sup>c</sup>MDS-UPDRS-III: Movement Disorder Society&#x2013;Sponsored Revision of the Unified Parkinson Disease Rating Scale.</p></fn><fn id="table1fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2"><title>Audio Preprocessing</title><p>Preprocessing was performed for 2 components: (1) time-frequency spectrogram representations used as CNN inputs, and (2) RR features used as the recognition-aware component. Each recording was standardized in the time domain (resampling to a fixed sampling rate, mono conversion, and conservative trimming), converted into 3 time-frequency views (Mel, CQT, and STFT), log-compressed, normalized using training-fold statistics, and resized to 128&#x00D7;128 grayscale images for network input. Detailed spectrogram normalization equations are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
All normalization parameters were estimated using only the training subset within each fold and applied unchanged to the validation and test subsets to prevent data leakage.</p><p>Audio files were loaded from WAV format and processed using librosa [<xref ref-type="bibr" rid="ref32">32</xref>]. The Mel representation used 128 Mel bands, a fast Fourier transform size of 2048, Hann window length of 2048, and a hop length of 512. The CQT used a hop length of 512, a minimum frequency of approximately 32.7 Hz, 12 bins per octave, and 84 frequency bins. The STFT used a fast Fourier transform size of 2048, Hann window length of 2048, and a hop length of 512. Modality-specific normalization (min-max for Mel, <italic>z</italic>-score for CQT, and robust scaling for STFT) was applied fold-wise using training statistics only (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>To obtain the recognition-aware feature, we used RR, which provides an intuitive measure of how well an automatic speech recognition system understands the participant&#x2019;s speech. Higher RR values indicate clearer and more intelligible speech, whereas lower values reflect reduced speech clarity.</p><p>Speech intelligibility is frequently compromised in individuals with Parkinson disease due to the progressive deterioration of motor control underlying speech production [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. This deterioration often results in decreased articulatory precision, particularly evident when producing phonetically complex utterances that demand fine-grained neuromuscular coordination [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. To capture this clinically relevant dimension of speech, we incorporate RR as a complementary feature. 
The sentence-reading task used in this study was designed to encompass both contextually meaningful sentences and phonetically demanding word sequences, providing a suitable basis for RR to reflect the degree of articulatory impairment.</p><p>The RR was computed from the sentence-reading task by comparing the target transcript with the automatic speech recognition output using a normalized edit distance:</p><disp-formula id="E1"> <label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">%</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mn>100</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">t</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi 
mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">t</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">%</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>100</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>d</italic><sub>edit</sub>(&#x00B7;,&#x00B7;) is the Levenshtein distance and |<italic>y</italic><sub>target</sub>| is the number of characters in the target string. The RR was computed independently for each recording using only its corresponding speech input and target transcript, without using any information from other participants. Therefore, RR extraction does not introduce any data leakage between training and test sets. All normalization parameters for RR features were estimated using only the training subset within each fold and applied unchanged to validation and test subsets, consistent with the approach used for spectrogram normalization. RR was computed for each of the 20 test cases, and the resulting 20 values were concatenated to form a fixed-length feature vector per participant.</p></sec><sec id="s2-3"><title>Model Architecture</title><p>To accurately classify Parkinson disease through recorded voice, we propose the multiview spectrogram recognition-aware Parkinson detection network (MSR-PDNet), which combines a multibranch CNN with a recognition-aware component derived from the RR. 
As illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>, MSR-PDNet receives 3 normalized spectrogram images (Mel, CQT, and STFT) as parallel inputs. Each branch applies stacked 2-dimensional convolutional blocks (convolution, batch normalization, nonlinear activation, and pooling), followed by global pooling to produce a fixed-length embedding vector. The 3 embedding vectors are concatenated into a fused multispectrogram representation, followed by batch normalization and a compact classification head. The output uses a single sigmoid activation to produce the Parkinson disease probability <italic>P</italic>(PD | <italic>x</italic>) &#x2208; [0, 1]. In the full MSR-PDNet model, the RR feature vector is concatenated with the fused spectrogram representation before the classification head, providing lightweight recording-level intelligibility context that complements local spectro-temporal patterns. An ablated variant without RR was also trained under the same protocol.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Complete architecture of the proposed multiview spectrogram recognition-aware Parkinson detection network. CNN: convolutional neural network; CQT: constant-Q transform; HC: healthy control; L2 reg: L2 regularization; MaxPool: max pooling; PD: Parkinson disease; ReLU: rectified linear unit; RR: recognition ratio; Stat. Sig.: statistically significant; STFT: short-time Fourier transform.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig01.png"/></fig><p>Models were implemented using TensorFlow and Keras (v2.19.0) and trained with the Adam optimizer at a learning rate of <italic>&#x03B7;</italic>=1&#x00D7;10<sup>&#x2212;4</sup>, binary cross-entropy loss, batch size of 16, and dropout of 0.3, for up to 40 epochs. 
Early stopping required a minimum of 20 epochs and stopped if the validation loss did not improve for 5 consecutive epochs.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>Voice recordings were obtained as part of a prospective clinical study conducted at the Department of Neurology, Inha University Hospital (Incheon, South Korea). The study protocol was approved by the Institutional Review Board of Inha University Hospital (IRB 2022-09-037). All experiments involving human participants were performed in accordance with relevant guidelines and regulations, and written informed consent was obtained from all participants and their legal guardians prior to participation. All collected data were deidentified before analysis. Participants were assigned unique study identification numbers, and only initials and study identification numbers were used during data collection and management. No compensation was provided to participants for their participation in this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>This section reports comparative results for MSR-PDNet against traditional machine learning and single-view CNN baselines, together with ablations over spectrogram views and the RR feature.</p><sec id="s3-1"><title>Experimental Setup</title><p>All models were evaluated using the same 5-fold participant-wise cross-validation protocol. In each fold, all spectrograms and RR feature vectors derived from a given participant were assigned exclusively to either the training or the held-out test set (no participant contributed data to both). For each fold, 80% of participants were used for training and 20% were held out for testing. Within each fold, class imbalance was addressed by bootstrap oversampling of the minority class in the training split only; the held-out test split remained unchanged. 
All experiments were executed on an Apple Silicon workstation (Apple M4).</p></sec><sec id="s3-2"><title>Performance Metrics</title><p>Performance was evaluated on the held-out 20% test split using accuracy, precision, recall (sensitivity), <italic>F</italic><sub>1</sub>-score, and the area under the receiver operating characteristic curve. Parkinson disease was treated as the positive class with a decision threshold of 0.5. Standard confusion matrix counts (true positive [TP], true negative [TN], false positive [FP], false negative [FN]) were used to compute accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">A</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula 
id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext>&#x00A0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The false negative rate (FNR) was defined as FNR=1&#x2212;Recall. Receiver operating characteristic curves were computed by sweeping the decision threshold, and the area under the curve was averaged across folds. 
All performance metrics are reported as mean (SD) across the 5-fold participant-wise cross-validation to reflect variability across folds.</p></sec><sec id="s3-3"><title>Comparison With Traditional Acoustic Feature&#x2013;Based Models</title><p>A reference baseline was established using traditional acoustic feature&#x2013;based machine learning models trained on acoustic descriptors (pitch and loudness statistics, jitter, shimmer, Mel-frequency cepstral coefficient bands 1&#x2010;4, and duration) under the same 5-fold participant-wise cross-validation protocol. Mean test accuracy ranged from 59.28% (SD 2.39% for logistic regression) to 68.83% (SD 3.47% for gradient boosting; <xref ref-type="table" rid="table2">Table 2</xref>; <xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Traditional machine learning baselines trained on acoustic features: mean (SD) test accuracy under 5-fold cross-validation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">Test accuracy (%), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Logistic regression</td><td align="left" valign="top">59.28 (2.39)</td></tr><tr><td align="left" valign="top">Decision tree</td><td align="left" valign="top">62.83 (5.26)</td></tr><tr><td align="left" valign="top">MLP<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">63.32 (4.82)</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> (RBF<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> kernel)</td><td align="left" valign="top">65.35 (2.43)</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">65.81 
(5.64)</td></tr><tr><td align="left" valign="top">Random forest</td><td align="left" valign="top">66.83 (4.10)</td></tr><tr><td align="left" valign="top">Gradient boosting</td><td align="left" valign="top">68.83 (3.47)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Models are sorted in ascending order of mean test accuracy.</p></fn><fn id="table2fn2"><p><sup>b</sup>MLP: multilayer perceptron.</p></fn><fn id="table2fn3"><p><sup>c</sup>SVM: support vector machine.</p></fn><fn id="table2fn4"><p><sup>d</sup>RBF: radial basis function.</p></fn><fn id="table2fn5"><p><sup>e</sup>XGBoost: Extreme Gradient Boosting.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Mean test accuracy comparison between traditional machine learning baselines (acoustic features) and multiview spectrogram recognition-aware Parkinson detection network (with and without RR) under 5-fold cross-validation. DT: decision tree; GB: gradient boosting; LR: logistic regression; MLP: multilayer perceptron; MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network; RF: random forest; RR: recognition ratio; SVM: support vector machine; XGB: Extreme Gradient Boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig02.png"/></fig><p>The multilayer perceptron achieved 63.32% (SD 4.82%) mean test accuracy, representing the best-performing multilayer perceptron configuration under the experimental setup. Key hyperparameters, including the number of hidden layers, neurons per layer, learning rate, activation function, and dropout rate, were systematically tuned using validation performance within each fold. Despite this optimization, the performance of the multilayer perceptron remained lower than that of convolutional models, which may reflect differences in inductive bias. 
Convolutional architectures are better suited to exploit spatial structure in spectrogram representations, whereas the multilayer perceptron operates on flattened feature vectors.</p><p>MSR-PDNet achieved a mean test accuracy of 86.9% (SD 25.2%) without RR and 97.4% (SD 5.7%) with RR (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The 97.4% result is 38.12 percentage points higher than logistic regression (mean 59.28%, SD 2.39%) and 28.57 percentage points higher than gradient boosting (mean 68.83%, SD 3.47%).</p></sec><sec id="s3-4"><title>Ablation Study on Spectrogram Representations</title><p>Ablation results quantify the impact of individual spectrogram views, 2-view fusion, and the RR on classification performance. Single-branch CNN baselines achieved mean test accuracies of 82.3% (SD 13.7% for STFT-only), 80% (SD 16.6% for Mel-only), and 76.9% (SD 11.6% for CQT-only; <xref ref-type="table" rid="table3">Table 3</xref>; <xref ref-type="fig" rid="figure3">Figure 3</xref>). The 3-branch fusion model achieved 86.9% (SD 25.2%) mean test accuracy using spectrograms alone, which is 4.6 percentage points higher than the best single-branch baseline. Incorporating the RR increased accuracy to 97.4% (SD 5.7%; +10.5 percentage points).</p><p>Two-branch ablations yielded 65.6% (SD 5.6%, without CQT; Mel and STFT), 61.0% (SD 8.9%, without STFT; Mel and CQT), and 60.9% (SD 14.5%, without Mel; CQT and STFT; <xref ref-type="table" rid="table4">Table 4</xref>; <xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Single-branch convolutional neural network baselines and MSR-PDNet variants under 5-fold cross-validation. 
Results are reported as mean (SD).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy (%), mean (SD)</td><td align="left" valign="bottom">Precision (%), mean (SD)</td><td align="left" valign="bottom">Recall (%), mean (SD)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%), mean (SD)</td><td align="left" valign="bottom">ROC-AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> (%), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">CQT<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-only</td><td align="left" valign="top">76.9 (11.6)</td><td align="left" valign="top">85.7 (10)</td><td align="left" valign="top">74.6 (22.2)</td><td align="left" valign="top">77.7 (13.6)</td><td align="left" valign="top">83.2 (8.2)</td></tr><tr><td align="left" valign="top">Mel-only</td><td align="left" valign="top">80 (16.6)</td><td align="left" valign="top">88.1 (7.2)</td><td align="left" valign="top">74.4 (25.1)</td><td align="left" valign="top">79.5 (18.1)</td><td align="left" valign="top">81.9 (12)</td></tr><tr><td align="left" valign="top">STFT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>-only</td><td align="left" valign="top">82.3 (13.7)</td><td align="left" valign="top">86.9 (8.8)</td><td align="left" valign="top">81 (20.7)</td><td align="left" valign="top">83.1 (14.3)</td><td align="left" valign="top">83.7 (11.8)</td></tr><tr><td align="left" valign="top">MSR-PDNet<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> (spectrograms only)</td><td align="left" valign="top">86.9 (25.2)</td><td align="left" valign="top">80 (44.7)</td><td align="left" valign="top">77.4 (43.3)</td><td align="left" valign="top">78.6 (42.9)</td><td align="left" valign="top">100 (0.1)</td></tr><tr><td align="left" valign="top">MSR-PDNet (with recognition ratio)</td><td align="left" valign="top">97.4 
(5.7)</td><td align="left" valign="top">99 (2.2)</td><td align="left" valign="top">96.5 (7.8)</td><td align="left" valign="top">97.7 (5.2)</td><td align="left" valign="top">98.5 (3.3)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ROC-AUC: receiver operating characteristic curve-area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>CQT: constant-Q transform.</p></fn><fn id="table3fn3"><p><sup>c</sup>STFT: short-time Fourier transform.</p></fn><fn id="table3fn4"><p><sup>d</sup>MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Ablation: mean test accuracy of single-branch baselines and MSR-PDNet (with and without recognition ratio) under 5-fold cross-validation. CQT: constant-Q transform; MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network; RR: recognition ratio; STFT: short-time Fourier transform.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig03.png"/></fig><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Two-branch ablations in comparison with MSR-PDNet under 5-fold cross-validation.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy (%), mean (SD)</td><td align="left" valign="bottom">Precision (%), mean (SD)</td><td align="left" valign="bottom">Recall (%), mean (SD)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%), mean (SD)</td><td align="left" valign="bottom">ROC-AUC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> (%), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Without CQT<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> (Mel and STFT<sup><xref ref-type="table-fn" 
rid="table4fn3">c</xref></sup>)</td><td align="left" valign="top">65.6 (5.6)</td><td align="left" valign="top">85.7 (13.0)</td><td align="left" valign="top">51.9 (12.0)</td><td align="left" valign="top">63.2 (8.2)</td><td align="left" valign="top">84.8 (11.9)</td></tr><tr><td align="left" valign="top">Without Mel (CQT and STFT)</td><td align="left" valign="top">60.9 (14.5)</td><td align="left" valign="top">94.9 (5.0)</td><td align="left" valign="top">35.5 (27.6)</td><td align="left" valign="top">46.6 (28.6)</td><td align="left" valign="top">88.7 (10.5)</td></tr><tr><td align="left" valign="top">Without STFT (Mel and CQT)</td><td align="left" valign="top">61.0 (8.9)</td><td align="left" valign="top">79.8 (10.4)</td><td align="left" valign="top">45.0 (12.9)</td><td align="left" valign="top">56.7 (11.8)</td><td align="left" valign="top">76.9 (14.0)</td></tr><tr><td align="left" valign="top">MSR-PDNet<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> (with recognition ratio)</td><td align="left" valign="top">97.4 (5.7)</td><td align="left" valign="top">99.0 (2.2)</td><td align="left" valign="top">96.5 (7.8)</td><td align="left" valign="top">97.7 (5.2)</td><td align="left" valign="top">98.5 (3.3)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>ROC-AUC: receiver operating characteristic curve-area under the curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>CQT: constant-Q transform.</p></fn><fn id="table4fn3"><p><sup>c</sup>STFT: short-time Fourier transform.</p></fn><fn id="table4fn4"><p><sup>d</sup>MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Ablation: mean test accuracy of 2-branch fusion variants and MSR-PDNet (with or without RR) under 5-fold cross-validation. 
CQT: constant-Q transform; MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network; RR: recognition ratio; STFT: short-time Fourier transform.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig04.png"/></fig><p>Notably, the spectrogram-only MSR-PDNet configuration showed substantially higher variability across folds (mean 86.9%, SD 25.2%) than the RR-augmented model (mean 97.4%, SD 5.7%), indicating that the recognition-aware feature improves not only accuracy but also stability under participant-wise cross-validation.</p></sec><sec id="s3-5"><title>Effect of Recognition-Aware Information on Parkinson Disease Classification</title><p>To examine whether RR differs between groups, the Parkinson disease and healthy control groups were compared using Welch 2-tailed <italic>t</italic> test independently for each of the 20 test cases. Five test cases showed statistically significant differences at <italic>P</italic>&#x003C;.05: test cases 2, 10, and 18&#x2010;20 (<xref ref-type="table" rid="table5">Table 5</xref>). Two test cases (10 and 19) remained significant after Bonferroni correction. 
Concentrating on significant test cases increased the Dunn index from 0.0570 (all test cases) to 0.1640 (significant test cases only), corresponding to a 2.88&#x00D7; increase in cluster separability (<xref ref-type="fig" rid="figure5">Figure 5</xref>; <xref ref-type="fig" rid="figure6">Figure 6</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Recognition ratio test cases showing statistically significant differences between Parkinson disease and healthy control groups.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Test case</td><td align="left" valign="bottom"><italic>t</italic> test (<italic>df</italic>)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Test case 2</td><td align="left" valign="top">&#x2212;2.103 (162.7)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Test case 10</td><td align="left" valign="top">&#x2212;4.411 (187.9)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Test case 18</td><td align="left" valign="top">&#x2212;2.798 (182.2)</td><td align="left" valign="top">.006</td></tr><tr><td align="left" valign="top">Test case 19</td><td align="left" valign="top">&#x2212;3.988 (161.9)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Test case 20</td><td align="left" valign="top">&#x2212;2.366 (185.0)</td><td align="left" valign="top">.02</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>A 2-tailed Welch <italic>t</italic> test was used. A negative <italic>t</italic> indicates a lower mean recognition ratio in the Parkinson disease group. 
Nominal significance: <italic>P</italic>&#x003C;.05.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Recognition ratio heatmaps. (A) All test cases and (B) statistically significant test cases (<italic>P</italic>&#x003C;.05; 5/20). Top annotation indicates Parkinson disease (green) and healthy controls (orange). PD: Parkinson disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p><italic>t</italic>-SNE visualizations of the recognition ratio feature space. (A) All test cases and (B) statistically significant test cases. <italic>t</italic>-SNE: <italic>t</italic>-distributed stochastic neighbor embedding; PD: Parkinson disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig06.png"/></fig><p>Incorporating RR increased the mean test accuracy from 86.9% (SD 25.2%) to 97.4% (SD 5.7%; <xref ref-type="table" rid="table3">Table 3</xref>). 
The mean FNR decreased from 0.226 to 0.035, yielding an approximate type II error reduction:</p><disp-formula id="E7"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi mathvariant="normal">&#x0394;</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">y</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">I</mml:mi><mml:mi mathvariant="normal">I</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">%</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi><mml:msub><mml:mi mathvariant="normal">R</mml:mi><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi><mml:msub><mml:mi mathvariant="normal">R</mml:mi><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi><mml:msub><mml:mi mathvariant="normal">R</mml:mi><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mn>100</mml:mn><mml:mo>&#x2248;</mml:mo><mml:mn>84.5</mml:mn><mml:mi 
mathvariant="normal">%</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Aggregated confusion matrices are shown in <xref ref-type="fig" rid="figure7">Figure 7</xref>.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Aggregated 5-fold test confusion matrices for multiview spectrogram recognition-aware Parkinson detection network without RR (A) and with RR (B). Matrices reflect 192 of 203 enrolled participants evaluated at test time (112 with PD and 80 HCs). Eleven participants with missing or unusable recordings were excluded prior to training. Row percentages are computed from pooled counts across all 5 folds. HC: healthy control; PD: Parkinson disease; RR: recognition ratio.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e94063_fig07.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>MSR-PDNet integrates 3 complementary spectrogram representations (Mel, STFT, and CQT) with an RR feature vector derived from the same voice recording to support noninvasive Parkinson disease screening. Under strict participant-wise 5-fold cross-validation, 3 main outcomes were observed: spectrogram-based CNN models outperformed traditional acoustic feature machine learning baselines; 3-view spectrogram fusion improved spectrogram-only performance beyond single-view baselines; and integrating the RR produced the largest gain. Mean accuracy increased from 86.9% (SD 25.2%) to 97.4% (SD 5.7%) with RR (+10.5 percentage points), and mean recall increased from 77.4% (SD 43.3%) to 96.5% (SD 7.8%; +19.1 percentage points), reducing the mean FNR from 0.226 to 0.035.</p><p>Spectrogram-based CNN models achieved higher held-out accuracy than acoustic feature machine learning baselines under the same protocol. 
The best single-view baseline (STFT only) reached 82.31% (SD 13.69%), whereas the strongest acoustic feature baseline (gradient boosting) reached 68.83% (SD 3.47%), a 13.47 percentage-point difference. This gap is consistent with time-frequency representations preserving local spectro-temporal patterns that are attenuated when recordings are compressed into summary acoustic statistics.</p><p>Three-view spectrogram fusion provided the strongest spectrogram-only performance (mean 86.9%, SD 25.2%), exceeding the best single-view baseline by 4.6 percentage points. Two-view configurations achieved only 60.9% (SD 14.5%) to 65.6% (SD 5.6%), indicating that fusion benefits depend on the specific view combination. Among 2-view configurations, removing Mel features yielded the lowest accuracy, indicating that Mel representations contribute important complementary information consistent with prior Parkinson disease voice classification studies [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>The strong performance of MSR-PDNet (97.4% [SD 5.7%] with RR) likely reflects the complementarity of the proposed feature representation. The 3 spectrogram views (Mel, STFT, and CQT) capture different acoustic aspects of the same voice signal, while the RR provides recording-level information that is not explicitly encoded in local spectrogram patterns. This interpretation is supported by the ablation results: single-view baselines achieved 76.9% (SD 11.6%) to 82.3% (SD 13.7%), 3-view fusion reached 86.9% (SD 25.2%), and adding RR further increased accuracy to 97.4% (SD 5.7%). 
At the same time, the controlled recording environment, fixed device, and structured reading task may also have contributed to the observed performance.</p><p>From a clinical perspective, the proposed model likely captures speech abnormalities associated with hypokinetic dysarthria, a motor speech disorder affecting up to 90% of individuals with Parkinson disease and characterized by reduced vocal loudness, monotonic pitch, imprecise consonant articulation, and diminished speech intelligibility [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. These impairments arise from the progressive degeneration of dopaminergic pathways that govern laryngeal, respiratory, and articulatory musculature, and they constitute some of the earliest detectable nonmotor biomarkers of the disease [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. The 3 spectrogram branches of MSR-PDNet are architecturally aligned with distinct acoustic manifestations of these clinical symptoms. The STFT branch operates on short-time Fourier representations and is particularly sensitive to rapid spectral fluctuations, including vocal tremor, fundamental frequency (F0) instability, and aperiodic noise components that characterize impaired phonation in Parkinson disease [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. The Mel-scale branch maps frequency content onto a perceptually weighted scale that emphasizes the low-to-mid frequency range, making it well-suited to detecting reduced vocal loudness, disrupted harmonic structure, and the breathy voice quality associated with hypophonia in Parkinson disease [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. 
The CQT branch provides logarithmic frequency resolution with high spectral precision at lower frequencies, enabling the model to identify monopitch patterns, harmonic distortion, and reduced vowel space that correspond to the monotone speech and imprecise articulation characteristic of hypokinetic dysarthria [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. By fusing these 3 complementary representations, MSR-PDNet captures a broader and more clinically complete acoustic profile of Parkinson disease&#x2013;related vocal impairment than any single-branch spectrogram could encode independently, which is directly consistent with the ablation results, showing that removing any single view substantially reduces performance.</p><p>Beyond vocal spectrograms, the RR feature incorporated in the full MSR-PDNet model reflects a clinically distinct dimension of Parkinson disease pathophysiology: respiratory motor dysfunction. A recent meta-analysis confirmed significantly elevated resting respiratory rates in Parkinson disease patients compared with healthy controls [<xref ref-type="bibr" rid="ref44">44</xref>], and a large-scale clinical study reported a 44% prevalence of respiratory dysfunction in Parkinson disease, attributing it to impaired thoracic musculature control and reduced respiratory drive [<xref ref-type="bibr" rid="ref45">45</xref>]. The RR, derived from sentence-reading performance, encodes both articulatory precision and respiratory support for speech, thereby providing physiological information that is qualitatively complementary to the spectral features captured by the 3 spectrogram branches. 
This clinical complementarity explains the substantial accuracy gain observed when the RR is added to the spectrogram-only model (mean 86.9%, SD 25.2% to mean 97.4%, SD 5.7%): the 2 feature modalities jointly reflect the vocal tract impairment and the respiratory motor impairment that together define hypokinetic dysarthria in Parkinson disease. Therefore, the model&#x2019;s behavior can be interpreted in terms of clinically meaningful manifestations of Parkinson disease&#x2013;related speech impairment rather than purely abstract signal patterns.</p><p>RR integration produced the largest improvement in accuracy and sensitivity. The corresponding FNR decreased from 0.226 to 0.035 (approximately 84.5% reduction), which is relevant for screening-oriented settings where missed cases are clinically undesirable.</p><p>Although the RR may be affected by factors such as accent, language proficiency, and recording quality, several findings support its relevance to Parkinson disease&#x2013;related speech changes. RR differences were observed between relatively age-matched Parkinson disease and healthy control groups (mean age 68.7 [SD 8.9] y vs 65.3 [SD 9.8] y), significant differences were identified across specific sentence-reading tasks (<xref ref-type="table" rid="table5">Table 5</xref>), and controlled acquisition conditions reduced recording-related variability. Therefore, although RR should not be regarded as a fully disease-specific biomarker in isolation, it appears to provide a meaningful recognition-aware feature reflecting Parkinson disease speech characteristics.</p><p>Recent state-of-the-art voice-based Parkinson disease classification studies were reviewed alongside MSR-PDNet, as summarized in <xref ref-type="table" rid="table6">Table 6</xref>. 
Given that each study adopts its own dataset, feature extraction approach, and evaluation protocol, the figures presented here serve as a broad reference point rather than a strict performance benchmark.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Comparison with recent state-of-the-art voice-based Parkinson disease classification methods, including MSR-PDNet (spectrogram only) and MSR-PDNet (spectrogram+RR).</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Type</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom">Reference</td></tr></thead><tbody><tr><td align="left" valign="top">Vision transformer (ViT+AST<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>)</td><td align="left" valign="top">Single</td><td align="left" valign="top">73</td><td align="left" valign="top">Perrone et al [<xref ref-type="bibr" rid="ref46">46</xref>]</td></tr><tr><td align="left" valign="top">ResNet CNN<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="left" valign="top">Single</td><td align="left" valign="top">84</td><td align="left" valign="top">Escobar-Grisales et al [<xref ref-type="bibr" rid="ref47">47</xref>]</td></tr><tr><td align="left" valign="top">EfficientNet-B2</td><td align="left" valign="top">Multi</td><td align="left" valign="top">84.39</td><td align="left" valign="top">Malekroodi et al [<xref ref-type="bibr" rid="ref48">48</xref>]</td></tr><tr><td align="left" valign="top">MSR-PDNet<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup> (spectrogram only)</td><td align="left" valign="top">Multi</td><td align="left" valign="top">86.9</td><td align="left" valign="top">Ours</td></tr><tr><td align="left" valign="top">DenseNet-161 (TL<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup>)</td><td align="left" valign="top">Single</td><td align="left" 
valign="top">89.75</td><td align="left" valign="top">Karaman et al [<xref ref-type="bibr" rid="ref49">49</xref>]</td></tr><tr><td align="left" valign="top">VGG-16<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup></td><td align="left" valign="top">Single</td><td align="left" valign="top">91.8</td><td align="left" valign="top">Malekroodi et al [<xref ref-type="bibr" rid="ref50">50</xref>]</td></tr><tr><td align="left" valign="top">VGG-16</td><td align="left" valign="top">Single</td><td align="left" valign="top">92</td><td align="left" valign="top">Bhatt et al [<xref ref-type="bibr" rid="ref51">51</xref>]</td></tr><tr><td align="left" valign="top">DenseNet+MobileNet+ShuffleNet</td><td align="left" valign="top">Multi</td><td align="left" valign="top">95.56</td><td align="left" valign="top">Chen et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td></tr><tr><td align="left" valign="top">CNN-LSTM<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup></td><td align="left" valign="top">Single</td><td align="left" valign="top">95.67</td><td align="left" valign="top">Shibina and Thasleema [<xref ref-type="bibr" rid="ref52">52</xref>]</td></tr><tr><td align="left" valign="top">MSR-PDNet (spectrogram+RR<sup><xref ref-type="table-fn" rid="table6fn7">g</xref></sup>)</td><td align="left" valign="top">Multi</td><td align="left" valign="top">97.4</td><td align="left" valign="top">Ours</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>AST: audio spectrogram transformer.</p></fn><fn id="table6fn2"><p><sup>b</sup>CNN: convolutional neural network.</p></fn><fn id="table6fn3"><p><sup>c</sup>MSR-PDNet: multiview spectrogram recognition-aware Parkinson detection network.</p></fn><fn id="table6fn4"><p><sup>d</sup>TL: transfer learning.</p></fn><fn id="table6fn5"><p><sup>e</sup>VGG-16: visual geometry group 16.</p></fn><fn id="table6fn6"><p><sup>f</sup>LSTM: long short-term memory.</p></fn><fn id="table6fn7"><p><sup>g</sup>RR: recognition 
ratio.</p></fn></table-wrap-foot></table-wrap><p>Among single-spectrogram approaches, the lower reported accuracies ranged from 73% to 84%, whereas several architectures, including DenseNet-161 (89.75% [<xref ref-type="bibr" rid="ref49">49</xref>]), visual geometry group 16 (VGG-16; 91.8% [<xref ref-type="bibr" rid="ref50">50</xref>]), Superlet-based VGG-16 (92% [<xref ref-type="bibr" rid="ref51">51</xref>]), and CNN and long short-term memory (95.67% [<xref ref-type="bibr" rid="ref52">52</xref>]), yielded higher values under their own experimental conditions. MSR-PDNet with RR achieved 97.4%, representing a strong result within this broader landscape, although cross-study variability in datasets and evaluation protocols limits direct comparability.</p><p>Similarly, multibranch and multimodel configurations such as EfficientNet-B2 fusion (84.39% [<xref ref-type="bibr" rid="ref48">48</xref>]) and ensemble models (95.56% [<xref ref-type="bibr" rid="ref26">26</xref>]) reflect the growing interest in combining multiple representations, which aligns with the motivation behind integrating multiview spectrograms with RR features in the proposed approach.</p><p>Despite these promising results, several limitations should be noted. First, evaluation was performed on a single-center dataset collected under controlled conditions using the same recording device and protocol. Accordingly, performance is expected to generalize best to similar clinical settings, whereas different cohorts, devices, and real-world environments may introduce distribution shifts that affect both spectrogram features and RR computation.
Future work should therefore include multicenter validation, noise-aware data augmentation, and domain adaptation to improve robustness.</p><p>Second, bootstrap oversampling was applied in the training split to address class imbalance; further evaluation of probability calibration would be beneficial for clinically meaningful sensitivity-specificity tradeoffs.</p><p>Third, RR depends on the speech task design and the automatic speech recognition pipeline, and it may vary across languages, prompts, and recognition systems.</p><p>Fourth, the current system requires a standardized sentence-reading task with predefined prompts to compute the RR, making it most suitable for supervised clinical or at-home screening. Performance may be affected by background noise, device variability, and shorter recordings, and adaptation would be needed for spontaneous-speech settings. Furthermore, the effectiveness of the RR is likely sensitive to the phonetic and articulatory characteristics of the selected prompts. While the current 20-sentence protocol contributed to improved classification performance, systematically optimized sentence designs may further enhance the ability to capture Parkinson disease&#x2013;related speech impairments. Future work should therefore evaluate robustness under diverse recording conditions, explore more flexible speech protocols, and investigate prompt optimization strategies to maximize the diagnostic sensitivity of the RR feature.</p><p>Finally, the low accuracy of 2-view configurations indicates that fusion benefits depend strongly on view pairing and training stability, and broader testing across datasets and alternative fusion strategies is warranted.</p></sec><sec id="s4-2"><title>Conclusions</title><p>This study presents MSR-PDNet, a multiview spectrogram-based framework for noninvasive Parkinson disease screening from voice recordings. 
Across participant-wise 5-fold cross-validation, the method achieved 86.9% (SD 25.2%) mean test accuracy using spectrogram fusion, improving to 97.4% (SD 5.7%) when the RR vector was added. The RR-augmented model showed higher sensitivity (recall: mean 96.5%, SD 7.8%) compared with the spectrogram-only configuration (mean 77.4%, SD 43.3%), with a substantially reduced FNR relevant for screening. Comparison with recent state-of-the-art methods suggested that MSR-PDNet is competitive within the current literature. Future work will focus on external validation, robustness testing across multidevice and multicenter cohorts, and adaptation to more flexible recording conditions to support clinical translation.</p></sec></sec></body><back><ack><p>Artificial intelligence&#x2013;assisted tools were used solely for grammar and language checking during manuscript preparation. The authors take full responsibility for all content.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Research Foundation of Korea grant funded by the Korean government (MSIT; RS-2023-00222406; RS-2026-25483035).</p></sec><sec><title>Data Availability</title><p>The data that support the findings of this study are not publicly available due to ethical and privacy restrictions involving human participants but are available from the corresponding author upon reasonable request and subject to institutional approval.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb2">CQT</term><def><p>constant-Q transform</p></def></def-item><def-item><term id="abb3">FN</term><def><p>false negative</p></def></def-item><def-item><term id="abb4">FNR</term><def><p>false negative rate</p></def></def-item><def-item><term id="abb5">FP</term><def><p>false 
positive</p></def></def-item><def-item><term id="abb6">MSR-PDNet</term><def><p>multiview spectrogram recognition-aware Parkinson detection network</p></def></def-item><def-item><term id="abb7">RR</term><def><p>recognition ratio</p></def></def-item><def-item><term id="abb8">STFT</term><def><p>short-time Fourier transform</p></def></def-item><def-item><term id="abb9">TN</term><def><p>true negative</p></def></def-item><def-item><term id="abb10">TP</term><def><p>true positive</p></def></def-item><def-item><term id="abb11">VGG-16</term><def><p>visual geometry group 16</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jankovic</surname><given-names>J</given-names> </name></person-group><article-title>Parkinson&#x2019;s disease: clinical features and diagnosis</article-title><source>J Neurol Neurosurg Psychiatry</source><year>2008</year><month>04</month><volume>79</volume><issue>4</issue><fpage>368</fpage><lpage>376</lpage><pub-id pub-id-type="doi">10.1136/jnnp.2007.131045</pub-id><pub-id pub-id-type="medline">18344392</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsanas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Little</surname><given-names>MA</given-names> </name><name name-style="western"><surname>McSharry</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Ramig</surname><given-names>LO</given-names> </name></person-group><article-title>Accurate telemonitoring of Parkinson&#x2019;s disease progression by noninvasive speech tests</article-title><source>IEEE Trans Biomed Eng</source><year>2010</year><month>04</month><volume>57</volume><issue>4</issue><fpage>884</fpage><lpage>893</lpage><pub-id 
pub-id-type="doi">10.1109/TBME.2009.2036000</pub-id><pub-id pub-id-type="medline">19932995</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sakar</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Isenkul</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Sakar</surname><given-names>CO</given-names> </name><etal/></person-group><article-title>Collection and analysis of a Parkinson speech dataset with multiple types of sound recordings</article-title><source>IEEE J Biomed Health Inform</source><year>2013</year><month>07</month><volume>17</volume><issue>4</issue><fpage>828</fpage><lpage>834</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2013.2245674</pub-id><pub-id pub-id-type="medline">25055311</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;r&#x00FC;ler</surname><given-names>H</given-names> </name></person-group><article-title>A novel diagnosis system for Parkinson&#x2019;s disease using complex-valued artificial neural network with k-means clustering feature weighting method</article-title><source>Neural Comput Applic</source><year>2017</year><month>07</month><volume>28</volume><issue>7</issue><fpage>1657</fpage><lpage>1666</lpage><pub-id pub-id-type="doi">10.1007/s00521-015-2142-2</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peker</surname><given-names>M</given-names> </name></person-group><article-title>A decision support system to improve medical diagnosis using a combination of k-medoids clustering based attribute weighting and SVM</article-title><source>J Med 
Syst</source><year>2016</year><month>05</month><volume>40</volume><issue>5</issue><fpage>116</fpage><pub-id pub-id-type="doi">10.1007/s10916-016-0477-6</pub-id><pub-id pub-id-type="medline">27000777</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Erdogdu Sakar</surname><given-names>B</given-names> </name><name name-style="western"><surname>Serbes</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sakar</surname><given-names>CO</given-names> </name></person-group><article-title>Analyzing the effectiveness of vocal features in early telediagnosis of Parkinson&#x2019;s disease</article-title><source>PLoS One</source><year>2017</year><volume>12</volume><issue>8</issue><fpage>e0182428</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0182428</pub-id><pub-id pub-id-type="medline">28792979</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Thyagarajan</surname><given-names>D</given-names> </name></person-group><article-title>Voice changes in Parkinson&#x2019;s disease: what are they telling us?</article-title><source>J Clin Neurosci</source><year>2020</year><month>02</month><volume>72</volume><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1016/j.jocn.2019.12.029</pub-id><pub-id pub-id-type="medline">31952969</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldman</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Volpe</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Ellis</surname><given-names>TD</given-names> </name><etal/></person-group><article-title>Delivering multidisciplinary rehabilitation care in Parkinson&#x2019;s disease: an international consensus statement</article-title><source>J Parkinsons Dis</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>135</fpage><lpage>166</lpage><pub-id pub-id-type="doi">10.3233/JPD-230117</pub-id><pub-id pub-id-type="medline">38277303</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x0160;ubert</surname><given-names>M</given-names> </name><name name-style="western"><surname>Novotn&#x00FD;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tykalov&#x00E1;</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Spoken language alterations can predict phenoconversion in isolated rapid eye movement sleep behavior disorder: a multicenter study</article-title><source>Ann Neurol</source><year>2024</year><month>03</month><volume>95</volume><issue>3</issue><fpage>530</fpage><lpage>543</lpage><pub-id pub-id-type="doi">10.1002/ana.26835</pub-id><pub-id pub-id-type="medline">37997483</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeancolas</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mangone</surname><given-names>G</given-names> </name><name name-style="western"><surname>Petrovska-Delacr&#x00E9;taz</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Voice characteristics from isolated rapid eye movement sleep behavior disorder to early Parkinson&#x2019;s disease</article-title><source>Parkinsonism Relat 
Disord</source><year>2022</year><month>02</month><volume>95</volume><fpage>86</fpage><lpage>91</lpage><pub-id pub-id-type="doi">10.1016/j.parkreldis.2022.01.003</pub-id><pub-id pub-id-type="medline">35063866</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gouda</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Elkamhawy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>J</given-names> </name></person-group><article-title>Emerging therapeutic strategies for Parkinson&#x2019;s disease and future prospects: a 2021 update</article-title><source>Biomedicines</source><year>2022</year><month>02</month><day>3</day><volume>10</volume><issue>2</issue><fpage>371</fpage><pub-id pub-id-type="doi">10.3390/biomedicines10020371</pub-id><pub-id pub-id-type="medline">35203580</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ash</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jester</surname><given-names>C</given-names> </name><name name-style="western"><surname>York</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Longitudinal decline in speech production in Parkinson&#x2019;s disease spectrum disorders</article-title><source>Brain Lang</source><year>2017</year><month>08</month><volume>171</volume><fpage>42</fpage><lpage>51</lpage><pub-id pub-id-type="doi">10.1016/j.bandl.2017.05.001</pub-id><pub-id pub-id-type="medline">28527315</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Daoudi</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Das</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tykalova</surname><given-names>T</given-names> </name><name name-style="western"><surname>Klempir</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rusz</surname><given-names>J</given-names> </name></person-group><article-title>Speech acoustic indices for differential diagnosis between Parkinson&#x2019;s disease, multiple system atrophy and progressive supranuclear palsy</article-title><source>NPJ Parkinsons Dis</source><year>2022</year><month>10</month><day>27</day><volume>8</volume><issue>1</issue><fpage>142</fpage><pub-id pub-id-type="doi">10.1038/s41531-022-00389-6</pub-id><pub-id pub-id-type="medline">36302780</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nunes</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Utilizing speech analysis to differentiate progressive supranuclear palsy from Parkinson&#x2019;s disease</article-title><source>Parkinsonism Relat Disord</source><year>2023</year><month>10</month><volume>115</volume><fpage>105835</fpage><pub-id pub-id-type="doi">10.1016/j.parkreldis.2023.105835</pub-id><pub-id pub-id-type="medline">37678101</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rusz</surname><given-names>J</given-names> </name><name name-style="western"><surname>Krupi&#x010D;ka</surname><given-names>R</given-names> </name><name name-style="western"><surname>V&#x00ED;te&#x010D;kov&#x00E1;</surname><given-names>S</given-names> 
</name><etal/></person-group><article-title>Speech and gait abnormalities in motor subtypes of de-novo Parkinson&#x2019;s disease</article-title><source>CNS Neurosci Ther</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>2101</fpage><lpage>2110</lpage><pub-id pub-id-type="doi">10.1111/cns.14158</pub-id><pub-id pub-id-type="medline">36942517</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Mohammed</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Al-Naji</surname><given-names>A</given-names> </name></person-group><article-title>Parkinson&#x2019;s disease detection from voice using artificial intelligence techniques: a review</article-title><conf-name>The Fifth Scientific Conference for Electrical Engineering Techniques Research (EETR2024)</conf-name><conf-date>Jun 15-16, 2024</conf-date><pub-id pub-id-type="doi">10.1063/5.0236188</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Non-invasive detection of Parkinson&#x2019;s disease based on speech analysis and interpretable machine learning</article-title><source>Front Aging Neurosci</source><year>2025</year><volume>17</volume><fpage>1586273</fpage><pub-id pub-id-type="doi">10.3389/fnagi.2025.1586273</pub-id><pub-id pub-id-type="medline">40370753</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name></person-group><article-title>Multi-label speech feature selection for Parkinson&#x2019;s disease subtype recognition using graph model</article-title><source>Comput Biol Med</source><year>2025</year><month>02</month><volume>185</volume><fpage>109566</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109566</pub-id><pub-id pub-id-type="medline">39719792</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedigh Malekroodi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Madusanka</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>M</given-names> </name></person-group><article-title>Speech-based Parkinson&#x2019;s detection using pre-trained self-supervised automatic speech recognition (ASR) models and supervised contrastive learning</article-title><source>Bioengineering (Basel)</source><year>2025</year><month>07</month><day>1</day><volume>12</volume><issue>7</issue><fpage>728</fpage><pub-id pub-id-type="doi">10.3390/bioengineering12070728</pub-id><pub-id pub-id-type="medline">40722419</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guatelli</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aubin</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Mora</surname><given-names>M</given-names> </name><name name-style="western"><surname>Naranjo-Torres</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mora-Olivari</surname><given-names>A</given-names> </name></person-group><article-title>Detection of Parkinson&#x2019;s disease based on spectrograms of voice recordings and extreme learning machine random weight neural networks</article-title><source>Eng Appl Artif Intell</source><year>2023</year><month>10</month><volume>125</volume><fpage>106700</fpage><pub-id pub-id-type="doi">10.1016/j.engappai.2023.106700</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HJ</given-names> </name></person-group><article-title>Exploring spectrogram-based audio classification for Parkinson&#x2019;s disease: a study on speech classification and qualitative reliability verification</article-title><source>Sensors (Basel)</source><year>2024</year><month>07</month><day>17</day><volume>24</volume><issue>14</issue><fpage>4625</fpage><pub-id pub-id-type="doi">10.3390/s24144625</pub-id><pub-id pub-id-type="medline">39066023</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quamar</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ambeth Kumar</surname><given-names>VD</given-names> </name><name name-style="western"><surname>Rizwan</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Bagdasar</surname><given-names>O</given-names> </name><name name-style="western"><surname>Kadar</surname><given-names>M</given-names> </name></person-group><article-title>Voice-based early diagnosis of Parkinson&#x2019;s disease using spectrogram features and AI models</article-title><source>Bioengineering (Basel)</source><year>2025</year><month>09</month><day>29</day><volume>12</volume><issue>10</issue><fpage>1052</fpage><pub-id pub-id-type="doi">10.3390/bioengineering12101052</pub-id><pub-id pub-id-type="medline">41155050</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name></person-group><article-title>Acoustic scene classification based on three-dimensional multi-channel feature-correlated deep learning networks</article-title><source>Sci Rep</source><year>2022</year><month>08</month><day>12</day><volume>12</volume><issue>1</issue><fpage>13730</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-17863-z</pub-id><pub-id pub-id-type="medline">35962021</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>E</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Zheng</surname><given-names>G</given-names> </name></person-group><article-title>CycleGuardian: a framework for automatic respiratory sound classification based on improved deep clustering and contrastive learning</article-title><source>Complex Intell Syst</source><year>2025</year><month>04</month><volume>11</volume><issue>4</issue><pub-id pub-id-type="doi">10.1007/s40747-025-01800-4</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><etal/></person-group><article-title>3D Mel-spectrogram&#x2013;based deep learning for automated multiclass diagnosis of pathological voices</article-title><source>Research Square</source><comment>Preprint posted online on Oct 28, 2025</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-7711140/v1</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lv</surname><given-names>R</given-names> </name><name name-style="western"><surname>Du</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Parkinson&#x2019;s disease detection using spectrogram-based multi-model feature fusion networks</article-title><source>Front Neurol</source><year>2025</year><volume>16</volume><fpage>1706317</fpage><pub-id pub-id-type="doi">10.3389/fneur.2025.1706317</pub-id><pub-id pub-id-type="medline">41281573</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>Islam</surname><given-names>M</given-names> </name><name name-style="western"><surname>Akter</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hossain</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Dewan</surname><given-names>MAA</given-names> </name></person-group><article-title>PD-Net: Parkinson&#x2019;s disease detection through fusion of two spectral features using attention-based hybrid deep neural network</article-title><source>Information</source><year>2025</year><volume>16</volume><issue>2</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.3390/info16020135</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedigh Malekroodi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>M</given-names> </name></person-group><article-title>Voice-based detection of Parkinson&#x2019;s disease using machine and deep learning approaches: a systematic review</article-title><source>Bioengineering (Basel)</source><year>2025</year><month>11</month><day>20</day><volume>12</volume><issue>11</issue><fpage>1279</fpage><pub-id pub-id-type="doi">10.3390/bioengineering12111279</pub-id><pub-id pub-id-type="medline">41301235</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Digital biomarkers for Parkinson disease: bibliometric analysis and a scoping 
review of deep learning for freezing of gait</article-title><source>J Med Internet Res</source><year>2025</year><month>05</month><day>20</day><volume>27</volume><fpage>e71560</fpage><pub-id pub-id-type="doi">10.2196/71560</pub-id><pub-id pub-id-type="medline">40392578</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Puri</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Naz</surname><given-names>H</given-names> </name><name name-style="western"><surname>Aich</surname><given-names>S</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gabralla</surname><given-names>LA</given-names> </name></person-group><article-title>Multi-modal deep learning framework for early detection of Parkinson&#x2019;s disease using neurological and physiological data for high-fidelity diagnosis</article-title><source>Sci Rep</source><year>2025</year><month>10</month><day>7</day><volume>15</volume><issue>1</issue><fpage>34835</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-21407-6</pub-id><pub-id pub-id-type="medline">41057513</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>ZY</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>YY</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>CH</given-names> </name></person-group><article-title>Digital biomarkers for precision 
diagnosis and monitoring in Parkinson&#x2019;s disease</article-title><source>NPJ Digit Med</source><year>2024</year><month>08</month><day>21</day><volume>7</volume><issue>1</issue><fpage>218</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01217-2</pub-id><pub-id pub-id-type="medline">39169258</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sushmitha Saro</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jaya Suriya</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rajakumari</surname><given-names>R</given-names> </name></person-group><article-title>Comprehensive speech emotion recognition system employing multi-layer perceptron (MLP) classifier and librosa feature extraction</article-title><conf-name>2023 International Conference on Sustainable Communication Networks and Application (ICSCNA)</conf-name><conf-date>Nov 15-17, 2023</conf-date><pub-id pub-id-type="doi">10.1109/ICSCNA58489.2023.10370394</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ho</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Iansek</surname><given-names>R</given-names> </name><name name-style="western"><surname>Marigliani</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bradshaw</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Gates</surname><given-names>S</given-names> </name></person-group><article-title>Speech impairment in a large sample of patients with Parkinson&#x2019;s disease</article-title><source>Behav Neurol</source><year>1999</year><month>01</month><day>1</day><volume>11</volume><issue>3</issue><fpage>131</fpage><lpage>137</lpage><pub-id 
pub-id-type="medline">22387592</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>N</given-names> </name></person-group><article-title>Speech, voice and language in Parkinson&#x2019;s disease: changes and interventions</article-title><source>Neurodegener Dis Manag</source><year>2012</year><month>06</month><volume>2</volume><issue>3</issue><fpage>279</fpage><lpage>289</lpage><pub-id pub-id-type="doi">10.2217/nmt.12.15</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skodda</surname><given-names>S</given-names> </name><name name-style="western"><surname>Visser</surname><given-names>W</given-names> </name><name name-style="western"><surname>Schlegel</surname><given-names>U</given-names> </name></person-group><article-title>Short- and long-term dopaminergic effects on dysarthria in early Parkinson&#x2019;s disease</article-title><source>J Neural Transm (Vienna)</source><year>2010</year><month>02</month><volume>117</volume><issue>2</issue><fpage>197</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.1007/s00702-009-0351-5</pub-id><pub-id pub-id-type="medline">20012657</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kent</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YJ</given-names> </name></person-group><article-title>Toward an acoustic typology of motor speech disorders</article-title><source>Clin Linguist Phon</source><year>2003</year><month>09</month><volume>17</volume><issue>6</issue><fpage>427</fpage><lpage>445</lpage><pub-id pub-id-type="doi">10.1080/0269920031000086248</pub-id><pub-id 
pub-id-type="medline">14564830</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Er</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Isik</surname><given-names>E</given-names> </name><name name-style="western"><surname>Isik</surname><given-names>I</given-names> </name></person-group><article-title>Parkinson&#x2019;s detection based on combined CNN and LSTM using enhanced speech signals with variational mode decomposition</article-title><source>Biomed Signal Process Control</source><year>2021</year><month>09</month><volume>70</volume><fpage>103006</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2021.103006</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Farag&#x00F3;</surname><given-names>P</given-names> </name><name name-style="western"><surname>&#x0218;tef&#x0103;nig&#x0103;</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Cordo&#x0219;</surname><given-names>CG</given-names> </name><etal/></person-group><article-title>CNN-based identification of Parkinson&#x2019;s disease from continuous speech in noisy environments</article-title><source>Bioengineering (Basel)</source><year>2023</year><month>04</month><day>26</day><volume>10</volume><issue>5</issue><fpage>531</fpage><pub-id pub-id-type="doi">10.3390/bioengineering10050531</pub-id><pub-id pub-id-type="medline">37237601</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahmatallah</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kemp</surname><given-names>AS</given-names> </name><name 
name-style="western"><surname>Iyer</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Pre-trained convolutional neural networks identify Parkinson&#x2019;s disease from spectrogram images of voice samples</article-title><source>Sci Rep</source><year>2025</year><month>03</month><day>1</day><volume>15</volume><issue>1</issue><fpage>7337</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-92105-6</pub-id><pub-id pub-id-type="medline">40025201</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atalar</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Oguz</surname><given-names>O</given-names> </name><name name-style="western"><surname>Genc</surname><given-names>G</given-names> </name></person-group><article-title>Hypokinetic dysarthria in Parkinson&#x2019;s disease: a narrative review</article-title><source>Sisli Etfal Hastan Tip Bul</source><year>2023</year><volume>57</volume><issue>2</issue><fpage>163</fpage><lpage>170</lpage><pub-id pub-id-type="doi">10.14744/SEMB.2023.29560</pub-id><pub-id pub-id-type="medline">37899809</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>F</given-names> </name><name name-style="western"><surname>Vogel</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Gharahkhani</surname><given-names>P</given-names> </name><name name-style="western"><surname>Renteria</surname><given-names>ME</given-names> </name></person-group><article-title>Speech and language biomarkers for Parkinson&#x2019;s disease prediction, early diagnosis and progression</article-title><source>NPJ Parkinsons 
Dis</source><year>2025</year><month>03</month><day>24</day><volume>11</volume><issue>1</issue><fpage>57</fpage><pub-id pub-id-type="doi">10.1038/s41531-025-00913-4</pub-id><pub-id pub-id-type="medline">40128529</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Majda-Zdancewicz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Potulska-Chromik</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nojszewska</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kostera-Pruszczyk</surname><given-names>A</given-names> </name></person-group><article-title>Speech signal analysis in patients with Parkinson&#x2019;s disease, taking into account phonation, articulation, and prosody of speech</article-title><source>Appl Sci</source><year>2024</year><volume>14</volume><issue>23</issue><fpage>11085</fpage><pub-id pub-id-type="doi">10.3390/app142311085</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roland</surname><given-names>V</given-names> </name><name name-style="western"><surname>Huet</surname><given-names>K</given-names> </name><name name-style="western"><surname>Harmegnies</surname><given-names>B</given-names> </name><name name-style="western"><surname>Piccaluga</surname><given-names>M</given-names> </name><name name-style="western"><surname>Verhaegen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Delvaux</surname><given-names>V</given-names> </name></person-group><article-title>Vowel production: a potential speech biomarker for early detection of dysarthria in Parkinson&#x2019;s disease</article-title><source>Front 
Psychol</source><year>2023</year><volume>14</volume><fpage>1129830</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2023.1129830</pub-id><pub-id pub-id-type="medline">37701868</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMahon</surname><given-names>L</given-names> </name><name name-style="western"><surname>Blake</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lennon</surname><given-names>O</given-names> </name></person-group><article-title>A systematic review and meta-analysis of respiratory dysfunction in Parkinson&#x2019;s disease</article-title><source>Eur J Neurol</source><year>2023</year><month>05</month><volume>30</volume><issue>5</issue><fpage>1481</fpage><lpage>1504</lpage><pub-id pub-id-type="doi">10.1111/ene.15743</pub-id><pub-id pub-id-type="medline">36779856</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van de Wetering-van Dongen</surname><given-names>VA</given-names> </name><name name-style="western"><surname>Nijkrake</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>van der Wees</surname><given-names>PJ</given-names> </name><etal/></person-group><article-title>Dyspnea and dystussia in Parkinson&#x2019;s disease: patient-reported prevalence and determinants</article-title><source>J Neurol</source><year>2025</year><month>03</month><day>22</day><volume>272</volume><issue>4</issue><fpage>283</fpage><pub-id pub-id-type="doi">10.1007/s00415-025-13008-0</pub-id><pub-id pub-id-type="medline">40121387</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perrone</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Amato</surname><given-names>F</given-names> </name><name name-style="western"><surname>Olmo</surname><given-names>G</given-names> </name></person-group><article-title>Voice classification in Parkinson&#x2019;s disease: a deep learning approach using transformers and error rate metrics</article-title><source>Biomed Signal Process Control</source><year>2026</year><month>03</month><volume>113</volume><fpage>108954</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2025.108954</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Escobar-Grisales</surname><given-names>D</given-names> </name><name name-style="western"><surname>R&#x00ED;os-Urrego</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Orozco-Arroyave</surname><given-names>JR</given-names> </name></person-group><article-title>Deep learning and artificial intelligence applied to model speech and language in Parkinson&#x2019;s disease</article-title><source>Diagnostics (Basel)</source><year>2023</year><month>06</month><day>25</day><volume>13</volume><issue>13</issue><fpage>2163</fpage><pub-id pub-id-type="doi">10.3390/diagnostics13132163</pub-id><pub-id pub-id-type="medline">37443557</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malekroodi</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Madusanka</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>M</given-names> </name></person-group><article-title>Multi-channel spectro-temporal representations for speech-based Parkinson&#x2019;s disease detection</article-title><source>J 
Imaging</source><year>2025</year><month>10</month><day>1</day><volume>11</volume><issue>10</issue><fpage>341</fpage><pub-id pub-id-type="doi">10.3390/jimaging11100341</pub-id><pub-id pub-id-type="medline">41150017</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karaman</surname><given-names>O</given-names> </name><name name-style="western"><surname>&#x00C7;ak&#x0131;n</surname><given-names>H</given-names> </name><name name-style="western"><surname>Alhudhaif</surname><given-names>A</given-names> </name><name name-style="western"><surname>Polat</surname><given-names>K</given-names> </name></person-group><article-title>Robust automated Parkinson disease detection based on voice signals with transfer learning</article-title><source>Expert Syst Appl</source><year>2021</year><month>09</month><volume>178</volume><fpage>115013</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2021.115013</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malekroodi</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Madusanka</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>M</given-names> </name></person-group><article-title>Leveraging deep learning for fine-grained categorization of Parkinson&#x2019;s disease progression levels through analysis of vocal acoustic patterns</article-title><source>Bioengineering (Basel)</source><year>2024</year><month>03</month><day>21</day><volume>11</volume><issue>3</issue><fpage>295</fpage><pub-id pub-id-type="doi">10.3390/bioengineering11030295</pub-id><pub-id pub-id-type="medline">38534569</pub-id></nlm-citation></ref><ref 
id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhatt</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jayanthi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>M</given-names> </name></person-group><article-title>High-resolution superlet transform based techniques for Parkinson&#x2019;s disease detection using speech signal</article-title><source>Appl Acoust</source><year>2023</year><month>11</month><volume>214</volume><fpage>109657</fpage><pub-id pub-id-type="doi">10.1016/j.apacoust.2023.109657</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shibina</surname><given-names>V</given-names> </name><name name-style="western"><surname>Thasleema</surname><given-names>TM</given-names> </name></person-group><article-title>A hybrid approach to detecting Parkinson&#x2019;s disease using spectrogram and deep learning CNN-LSTM network</article-title><source>Int J Speech Technol</source><year>2024</year><month>09</month><volume>27</volume><issue>3</issue><fpage>657</fpage><lpage>671</lpage><pub-id pub-id-type="doi">10.1007/s10772-024-10128-2</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Detailed spectrogram normalization equations.</p><media xlink:href="medinform_v14i1e94063_app1.docx" xlink:title="DOCX File, 116 KB"/></supplementary-material></app-group></back></article>