<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e66907</article-id><article-id pub-id-type="doi">10.2196/66907</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Multimodal Multitask Learning for Predicting Depression Severity and Suicide Risk Using Pretrained Audio and Text Embeddings: Methodology Development and Application</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Hu</surname><given-names>Ya-Han</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Ruei-Yan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Su</surname><given-names>Min-Yi</given-names></name><degrees>MAS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" 
equal-contrib="yes"><name name-style="western"><surname>Lin</surname><given-names>I-Li</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Shen</surname><given-names>Cheng-Che</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Information Management, National Central University</institution><addr-line>No. 300, Zhongda Rd., Zhongli Dist.</addr-line><addr-line>Taoyuan City</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Asian Institute for Impact Measurement and Management, National Central University</institution><addr-line>Taoyuan City</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Graduate School of Resources Management and Decision Science, Management College, National Defense University</institution><addr-line>Taipei City</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Department of Radiology, Ditmanson Medical Foundation Chia-Yi Christian Hospital</institution><addr-line>Chiayi</addr-line><country>Taiwan</country></aff><aff id="aff5"><institution>Jianan Psychiatric Center</institution><addr-line>No. 
539, Yuzhong Rd., Rende Dist.</addr-line><addr-line>Tainan City</addr-line><country>Taiwan</country></aff><aff id="aff6"><institution>School of Medicine, National Yang Ming Chiao Tung University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Chen</surname><given-names>Qingyu</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Guishen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hossain Shuvo</surname><given-names>Md Maruf</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>El-Hafeez</surname><given-names>Tarek Abd</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Cheng-Che Shen, MD, PhD, Jianan Psychiatric Center, No. 539, Yuzhong Rd., Rende Dist., Tainan City, 71742, Taiwan, 886 62795019 ext 1537; <email>pures1000@yahoo.com.tw</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>30</day><month>10</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e66907</elocation-id><history><date date-type="received"><day>29</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Ya-Han Hu, Ruei-Yan Wu, Min-Yi Su, I-Li Lin, Cheng-Che Shen. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 30.10.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e66907"/><abstract><sec><title>Background</title><p>Depression is a critical psychological disorder necessitating urgent assessment and treatment, given its strong association with increased suicide risk (SR). Effective management hinges on promptly identifying individuals with high depression severity (DS) and SR. While machine learning and deep learning have advanced the identification of DS and SR, research focusing on both aspects simultaneously remains limited and requires further refinement.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether our proposed methods, which integrate multitask learning (MTL), multimodal learning, and transfer learning, enhance the efficacy of deep learning models in the joint classification of DS and SR.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study proposed a multitask framework employing a multimodal fusion strategy for pretrained audio and text embeddings to concurrently assess DS and SR. 
Data encompassing Chinese audio recordings and clinical questionnaire scores from 100 patients with depression and 100 healthy controls were used. Preprocessed audio and text data were transformed into pretrained embeddings and integrated using concatenation and hard parameter sharing. Single-task learning (STL) models (DS and SR tasks) were evaluated with different embeddings and further compared with the MTL models.</p></sec><sec sec-type="results"><title>Results</title><p>The STL models demonstrated exceptional DS prediction (area under the curve [AUC]=0.878) using wav2vec 2.0 combined with ERNIE-health, and SR prediction (AUC=0.876) using HuBERT combined with ERNIE-health. The MTL models significantly improved SR prediction over DS prediction, achieving the highest DS classification (AUC=0.887) with wav2vec 2.0 combined with ERNIE-health, and SR classification (AUC=0.883) with HuBERT combined with ERNIE-health.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The findings of this study underscore the effectiveness of the proposed MTL models using specific pretrained audio and text embeddings in enhancing model performance. However, we advocate for cautious implementation of MTL to mitigate potential negative transfer effects. 
Our research presents a method that is both promising and effective, offering an objective approach for accurate clinical decision support in the parallel diagnosis of DS and SR.</p></sec></abstract><kwd-group><kwd>depression severity</kwd><kwd>suicide risk</kwd><kwd>multitask learning</kwd><kwd>multimodal learning</kwd><kwd>transfer learning</kwd><kwd>mental health</kwd><kwd>mental illnesses</kwd><kwd>mental disorders</kwd><kwd>depression</kwd><kwd>depressed</kwd><kwd>major depressive disorder</kwd><kwd>MDD</kwd><kwd>depressive disorder</kwd><kwd>machine learning</kwd><kwd>ML</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>algorithms</kwd><kwd>predictive models</kwd><kwd>predictive analytics</kwd><kwd>deep learning</kwd><kwd>early detection</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>In its pervasive embrace, depression, an ever-expanding mental malady, reaches across the globe, leaving its mark on approximately 280 million lives [<xref ref-type="bibr" rid="ref1">1</xref>]. Neglecting proper care of patients with depression can lead to dire consequences, as research has shown that individuals with this condition face a staggering 20-fold higher risk of suicide than the general population [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], exposing a troubling link between depression and suicide [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In clinical practice, regular and comprehensive assessments of depression severity (DS) and suicide risk (SR) remain challenging due to time and resource constraints [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
Traditional evaluations often rely on subjective and infrequent self-reports from patients or caregivers, which are susceptible to recall bias, cognitive limitations, and social stigma [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. These issues are particularly acute in high-volume settings, where limited consultation time may hinder the timely identification of critical warning signs related to mental health deterioration or suicidal ideation.</p><p>Machine learning has demonstrated strong potential in predicting DS and SR, using text and audio data (eg, [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]). Text-based approaches have extracted clinically meaningful insights from medical narratives [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], while speech analysis has improved predictive accuracy by identifying vocal biomarkers linked to depression and suicide, such as reduced intensity, slower tempo, and increased hesitation [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. These advances have driven the development of multimodal learning (MML) frameworks for mental health prediction. Although effective in detecting depression [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], applications of MML to SR prediction remain limited&#x2014;likely due to the scarcity of high-quality annotated data in this sensitive domain [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Given the frequent co-occurrence of depression and suicide in clinical populations [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>], SR prediction is inherently linked to depression assessment. 
This conceptual interdependence highlights the potential of multitask learning (MTL) for simultaneously modeling related mental health outcomes. Benton et al [<xref ref-type="bibr" rid="ref25">25</xref>] demonstrated the utility of MTL by jointly predicting SR and other psychiatric conditions using social media data. With the rapid advancement of deep learning, transfer learning (TL) has also emerged as a promising strategy to address data scarcity, with recent studies showing that fine-tuning pretrained models on downstream mental health tasks can significantly enhance predictive performance (eg, [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]).</p><p>However, the current literature reveals several gaps. First, data source diversity remains limited, with most studies relying on datasets from English-speaking populations (eg, [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]). In addition, many analyses are based on social media platforms (eg, [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]) or public datasets (eg, [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]), which often lack clinical relevance. Second, most SR prediction studies have underutilized TL for audio processing, despite its successful application in related domains such as speech emotion recognition [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Third, although the comorbidity and shared clinical features of DS and SR are well documented [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>], few studies have applied MTL to model these outcomes jointly.</p><p>While recent advances in MML and MTL have shown promise in mental health prediction, few studies have jointly modeled DS and SR using clinically grounded, non-English data. 
Furthermore, the potential of TL to improve model generalizability across tasks and modalities remains underexplored in Chinese-language clinical contexts. These gaps motivate this study&#x2019;s unified framework, which integrates MML, MTL, and TL to support scalable and efficient mental health screening in real-world clinical settings for Chinese-speaking populations.</p><p>The key contributions of this work are three-fold: (1) the development of the first integrated framework that combines MML, MTL, and TL for the joint prediction of DS and SR in Chinese contexts; (2) empirical validation of MML approaches compared to single modality baselines in a non-English clinical setting; and (3) demonstration of the effectiveness of MTL in modeling related mental health constructs. By addressing linguistic, cultural, and resource-specific challenges, this framework supports scalable and efficient screening in high-volume clinical environments, addressing an urgent need in early mental health assessments and targeted interventions.</p></sec><sec id="s1-2"><title>Related Work</title><p>Research on predictive models in mental health has traditionally adopted single-task approaches, predicting either depression or suicide independently [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref38">38</xref>]. These studies have primarily relied on text, audio, or other features, such as structured electronic health records [<xref ref-type="bibr" rid="ref39">39</xref>] and social media images [<xref ref-type="bibr" rid="ref40">40</xref>], to build predictive models. Recent advancements in text processing technologies have facilitated a shift from conventional hand-crafted features toward sophisticated automated feature learning approaches, exemplified by the heterogeneous graph convolutional network of Wang et al [<xref ref-type="bibr" rid="ref14">14</xref>]. 
Concurrently, speech-based analyses have gained prominence for their capacity to capture nuanced vocal markers indicative of mental health conditions [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>MTL has emerged as a promising framework for mental health assessment, as summarized in <xref ref-type="table" rid="table1">Table 1</xref>, aligning with the clinical observation that psychiatric conditions often co-occur and share common underlying mechanisms [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. By jointly learning related tasks, MTL facilitates representation sharing and information transfer, thereby mitigating data sparsity and overfitting issues [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. Benton et al [<xref ref-type="bibr" rid="ref25">25</xref>] pioneered the use of deep neural networks to simultaneously predict depression and SR using Twitter data.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of key literature on multitask learning for depression severity and suicide risk prediction.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Study</td><td align="left" valign="top">Dataset</td><td align="left" valign="top">Language</td><td align="left" valign="top">Sample</td><td align="left" valign="top" colspan="2">Modality</td><td align="left" valign="top">TL<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top" colspan="2">Task</td><td align="left" valign="top">Method</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">A<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">T<sup><xref ref-type="table-fn" 
rid="table1fn3">c</xref></sup></td><td align="left" valign="bottom"/><td align="left" valign="bottom">DS<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="bottom">SR<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top">Benton et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Multiple Twitter datasets</td><td align="left" valign="top">English</td><td align="left" valign="top">9611 users</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">DNN<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Qureshi et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">DAIC-WOZ<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">English</td><td align="left" valign="top">189 recordings</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Ophir et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">Facebook posts</td><td align="left" valign="top">English</td><td align="left" valign="top">83,292 postings</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">ANN<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup></td></tr><tr><td align="left" 
valign="top">Qureshi et al [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">DAIC-WOZ, CMU-MOSEI<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup></td><td align="left" valign="top">English</td><td align="left" valign="top">189 recordings</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">LSTM</td></tr><tr><td align="left" valign="top">Dumpala et al [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">FORBOW<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup></td><td align="left" valign="top">English</td><td align="left" valign="top">526 recordings</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">CNN<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup></td></tr><tr><td align="left" valign="top">Yang et al [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top">Chinese micro-blog</td><td align="left" valign="top">Chinese</td><td align="left" valign="top">6100 comments</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">DNN</td></tr><tr><td align="left" valign="top">Ghosh et al [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">CEASE</td><td align="left" valign="top">English</td><td align="left" valign="top">2539 sentences</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td 
align="left" valign="top">Bi-GRU<sup><xref ref-type="table-fn" rid="table1fn13">m</xref></sup></td></tr><tr><td align="left" valign="top">Buddhitha and Inkpen [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">CLPsych 2015 Twitter, UMD<sup><xref ref-type="table-fn" rid="table1fn14">n</xref></sup>, SMHD<sup><xref ref-type="table-fn" rid="table1fn15">o</xref></sup></td><td align="left" valign="top">English</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn16">p</xref></sup></td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">CNN</td></tr><tr><td align="left" valign="top">Teng et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">AVEC<sup><xref ref-type="table-fn" rid="table1fn17">q</xref></sup> 2019 DDS Challenge Dataset, CMU-MOSEI</td><td align="left" valign="top">English</td><td align="left" valign="top">23,454 video clips and 275 users</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">DNN</td></tr><tr><td align="left" valign="top">Yang et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">CEASE</td><td align="left" valign="top">English</td><td align="left" valign="top">2393 sentences</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table1fn18">r</xref></sup></td></tr><tr><td align="left" valign="top">This study</td><td align="left" valign="top">Self-collected</td><td align="left" 
valign="top">Chinese</td><td align="left" valign="top">200 users</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">DNN</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>TL: transfer learning.</p></fn><fn id="table1fn2"><p><sup>b</sup>A: audio modality.</p></fn><fn id="table1fn3"><p><sup>c</sup>T: text modality.</p></fn><fn id="table1fn4"><p><sup>d</sup>DS: depression severity.</p></fn><fn id="table1fn5"><p><sup>e</sup>SR: suicide risk.</p></fn><fn id="table1fn6"><p><sup>f</sup>DNN: deep neural network.</p></fn><fn id="table1fn7"><p><sup>g</sup>DAIC-WOZ: distress analysis interview corpus-Wizard of Oz.</p></fn><fn id="table1fn8"><p><sup>h</sup>LSTM: long short-term memory.</p></fn><fn id="table1fn9"><p><sup>i</sup>ANN: artificial neural network.</p></fn><fn id="table1fn10"><p><sup>j</sup>CMU-MOSEI: CMU multimodal opinion sentiment and emotion intensity.</p></fn><fn id="table1fn11"><p><sup>k</sup>FORBOW: families overcoming risks and building opportunities for wellbeing.</p></fn><fn id="table1fn12"><p><sup>l</sup>CNN: convolutional neural network.</p></fn><fn id="table1fn13"><p><sup>m</sup>Bi-GRU: bidirectional gated recurrent unit.</p></fn><fn id="table1fn14"><p><sup>n</sup>UMD: University of Maryland Reddit suicidality dataset.</p></fn><fn id="table1fn15"><p><sup>o</sup>SMHD: self-reported mental health diagnoses dataset.</p></fn><fn id="table1fn16"><p><sup>p</sup>Not applicable.</p></fn><fn id="table1fn17"><p><sup>q</sup>AVEC: audio/visual emotion challenge.</p></fn><fn id="table1fn18"><p><sup>r</sup>BERT: bidirectional encoder representations from transformers.</p></fn></table-wrap-foot></table-wrap><p>Several studies listed in <xref ref-type="table" rid="table1">Table 1</xref> have incorporated MML to improve predictive performance. 
By integrating diverse data types, MML leverages complementary information to enable a more comprehensive characterization of mental states. Qureshi et al [<xref ref-type="bibr" rid="ref31">31</xref>], for example, demonstrated enhanced depression prediction accuracy using long short-term memory models trained on combined textual and acoustic features from the DAIC-WOZ (distress analysis interview corpus-Wizard of Oz) dataset. Additionally, TL has also been increasingly adopted in these frameworks to address the challenge of limited labeled data. Teng et al [<xref ref-type="bibr" rid="ref26">26</xref>] applied depression detection with sentiment assistance through deep neural networks and TL techniques on the AVEC (audio/visual emotion challenge) 2019 DDS Challenge and CMU-MOSEI (CMU multimodal opinion sentiment and emotion intensity) datasets. Similarly, Yang et al [<xref ref-type="bibr" rid="ref27">27</xref>] used MTL with a BERT-based model to incorporate time-perspective cues for suicidal ideation detection on the CEASE dataset.</p><p>Despite these advances, key limitations persist. First, most studies rely on English-language data. Furthermore, text-based models are often trained on social media content [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref45">45</xref>], while audio models rely on public datasets [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref46">46</xref>] that may lack relevance to real-world clinical scenarios, thereby potentially limiting their applicability. Second, most SR prediction models are still trained from scratch, with only a few studies (eg, [<xref ref-type="bibr" rid="ref27">27</xref>]) leveraging TL to enhance model performance. 
Most critically, empirical research exploring MTL&#x2019;s effectiveness for simultaneously predicting both DS and SR remains scarce. To our knowledge, only Benton et al [<xref ref-type="bibr" rid="ref25">25</xref>] have conducted similar research, though their work was conducted exclusively in English on social media data.</p><p>To address these gaps, this study introduces a unified MML, MTL, and TL framework for the simultaneous prediction of DS and SR using Chinese-language data collected in clinical settings. This approach facilitates the development of culturally and linguistically tailored predictive models for Chinese-speaking populations. Moreover, by incorporating TL, the proposed framework retains knowledge acquired from source tasks, enabling efficient adaptation to downstream applications.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study received approval from the Institutional Review Board of Taichung Veterans General Hospital (approval number: SE21183B).</p><p>Every participant was required to complete and sign a participant consent form before their involvement. This form outlined the purpose and procedures of the study, potential risks and benefits, confidentiality measures, and voluntary participation rights. The completion of this form indicated their informed and voluntary consent to partake in the study. In the section of the participant consent form dedicated to &#x201C;consent to participate,&#x201D; participants were explicitly informed about the inclusion of a clause seeking their agreement to employ their personal data, information, or research outcomes for publication purposes. 
By completing and signing the participant consent form, participants signified their understanding and acceptance of the terms outlined, thereby granting their &#x201C;consent for publication.&#x201D; This agreement encompassed the use of their anonymized data and contributions in academic papers, reports, presentations, or other forms of scholarly dissemination.</p></sec><sec id="s2-2"><title>Study Population</title><p>We collected a Chinese chief complaint dataset, which includes data from 100 patients with depression from a regional hospital in southern Taiwan, along with 100 age- and sex-matched nondepressed counterparts selected at random, resulting in a total of 200 cases. To verify the matching process, we conducted statistical tests. The chi-square test for sex in relation to the prevalence of the condition was not significant (<italic>P</italic>=.88). Similarly, the <italic>t</italic> test for age in relation to the prevalence of the condition was not significant (<italic>P</italic>=.60).</p><p>Each case in the dataset includes personal data, an audio recording describing the current situation, transcripts, and clinical questionnaire results. The audio recordings were acquired by instructing participants as follows: &#x201C;Please take a minute to elucidate your recent emotions, life circumstances, and other states.&#x201D; Subsequently, participants initiated the recording of their spoken expressions. Based on the questionnaire results, we conducted 2 specific clinical assessments: Hamilton Depression Rating Scale-17 (HAMD-17) [<xref ref-type="bibr" rid="ref47">47</xref>] and SAD PERSONS scale [<xref ref-type="bibr" rid="ref48">48</xref>]. DS was categorized into 3 levels: no depression (HAMD-17 score of 0&#x2010;7; sample size of 106), low/moderate depression (HAMD-17 score of 8-16/17-23; sample size of 21), and high depression (HAMD-17 score of &#x2265;24; sample size of 73). 
SR was classified into 2 levels: low risk (SAD PERSONS score of 0&#x2010;3; sample size of 110) and moderate/high risk (SAD PERSONS score of 4-7/8-10; sample size of 90).</p><p>The demographic data for both groups can be found in <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>. Among the 3 DS groups, there were statistically significant differences regarding age (<italic>P</italic>=.048), educational level (<italic>P</italic>&#x003C;.001), occupation (<italic>P</italic>=.01), and marriage (<italic>P</italic>=.001). In terms of educational level, the HAMD-17&#x2264;7 group exhibited higher levels compared to the 8&#x003C;HAMD-17&#x2264;23 and HAMD-17&#x2265;24 groups, and the proportion of individuals employed was also higher in the HAMD-17&#x2264;7 group than in the 8&#x003C;HAMD-17&#x2264;23 and HAMD-17&#x2265;24 groups (64/97, 66% vs 13/30, 43% and 33/73, 45%). In terms of marital status, the HAMD-17&#x2264;7 group had a higher proportion of married individuals and a lower proportion of divorced individuals. Between the 2 SR groups, there were statistically significant differences regarding educational level (<italic>P</italic>&#x003C;.001), occupation (<italic>P</italic>=.02), and marriage (<italic>P</italic>&#x003C;.001). In terms of educational level, the SAD PERSONS&#x2264;3 group exhibited higher levels compared to the SAD PERSONS&#x2265;4 group. In terms of occupation, the proportion of employed individuals was also higher in the SAD PERSONS&#x2264;3 group than in the SAD PERSONS&#x2265;4 group (69/110, 62.7% vs 41/90, 45.6%). 
In terms of marital status, the SAD PERSONS&#x2265;4 group had a higher proportion of unmarried individuals.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Demographic data of patients in the 3 depression severity groups.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">HAMD-17<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>&#x2264;7 group (n=97)</td><td align="left" valign="bottom">8&#x003C;HAMD-17&#x2264;23 group (n=30)</td><td align="left" valign="bottom">HAMD-17&#x2265;24 group (n=73)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Sex, n (%)</td><td align="left" valign="top">.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">29 (30)</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">21 (29)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">68 (70)</td><td align="left" valign="top">21 (70)</td><td align="left" valign="top">52 (71)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">44 (17)</td><td align="left" valign="top">38 (19)</td><td align="left" valign="top">47 (18)</td><td align="left" valign="top">.048<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="4">Education level, n (%)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Elementary school</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">6 (8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Junior high school</td><td align="left" valign="top">2 (2)</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">8 (11)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior high school</td><td align="left" valign="top">12 (12)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">29 (40)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>College degree or higher</td><td align="left" valign="top">83 (86)</td><td align="left" valign="top">20 (67)</td><td align="left" valign="top">30 (41)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="4">Occupation, n (%)</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">64 (66)</td><td align="left" valign="top">13 (43)</td><td align="left" valign="top">33 (45)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">33 (34)</td><td align="left" valign="top">17 (57)</td><td align="left" valign="top">40 (55)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="4">Marriage, n (%)</td><td align="left" 
valign="top">.001<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unmarried</td><td align="left" valign="top">36 (37)</td><td align="left" valign="top">20 (67)</td><td align="left" valign="top">27 (37)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Married</td><td align="left" valign="top">61 (63)</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">39 (53)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Divorced</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (3)</td><td align="left" valign="top">7 (10)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>HAMD-17: Hamilton Depression Rating Scale-17.</p></fn><fn id="table2fn2"><p><sup>b</sup>Statistical significance.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Demographic data of patients in the 2 suicide risk groups.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Variable</td><td align="left" valign="top">SAD PERSONS&#x2264;3 group (n=110)</td><td align="left" valign="top">SAD PERSONS&#x2265;4 group (n=90)</td><td align="left" valign="top"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Sex, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">27 (24.5)</td><td align="left" 
valign="top">32 (35.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">83 (75.5)</td><td align="left" valign="top">58 (64.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">45 (16.7)</td><td align="left" valign="top">42 (19.3)</td><td align="left" valign="top">.26</td></tr><tr><td align="left" valign="top">Education level, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Elementary school</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">8 (8.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Junior high school</td><td align="left" valign="top">4 (3.6)</td><td align="left" valign="top">8 (8.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior high school</td><td align="left" valign="top">17 (15.5)</td><td align="left" valign="top">30 (33.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>College degree or higher</td><td align="left" valign="top">89 (80.9)</td><td align="left" valign="top">44 (48.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Occupation, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.02<sup><xref ref-type="table-fn" 
rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">69 (62.7)</td><td align="left" valign="top">41 (45.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">41 (37.3)</td><td align="left" valign="top">49 (54.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Marriage, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unmarried</td><td align="left" valign="top">34 (30.9)</td><td align="left" valign="top">49 (54.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Married</td><td align="left" valign="top">75 (68.2)</td><td align="left" valign="top">34 (37.8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Divorced</td><td align="left" valign="top">1 (0.9)</td><td align="left" valign="top">7 (7.8)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Statistical significance.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Proposed Framework</title><p>The framework comprises 3 components: feature extraction, multimodal fusion, and MTL architecture, which are discussed sequentially in the following sections (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
First, audio and text data undergo processing by pretrained models to extract their embeddings. Second, the embeddings obtained from the previous step are fused using a modality fusion layer. The resulting fused representations are then fed into a fully connected (FC) network to project them into lower-dimensional vectors. Lastly, these representations are shared between the 2 classification tasks (DS and SR) and are input into 2 task-specific layers implemented as multilayer perceptron classifiers to generate output probabilities separately. The details of the 3 components are presented below.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of our proposed framework. FC: fully connected.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66907_fig01.png"/></fig><p>First, considering the small sample size in our study and recognizing the potential of TL in predicting DS and SR, we used 4 advanced pretrained models for feature extraction: wav2vec 2.0 and HuBERT for audio analysis [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>], and Longformer and ERNIE-health for text analysis [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. Each method has distinct advantages, rendering them especially suitable for our research objectives, as elaborated upon in the following sections.</p><list list-type="bullet"><list-item><p>wav2vec 2.0: It is developed by Facebook AI Research, uses a multilayer convolutional neural network (CNN) for audio encoding, and is supplemented by latent representation masking and contextualization through a Transformer network trained with contrastive learning methods [<xref ref-type="bibr" rid="ref49">49</xref>]. 
This self-supervised model excels with minimal labeled data, consistently surpassing state-of-the-art models, as demonstrated in the tasks of depression detection [<xref ref-type="bibr" rid="ref52">52</xref>] and emotion recognition [<xref ref-type="bibr" rid="ref53">53</xref>].</p></list-item></list><list list-type="bullet"><list-item><p>HuBERT: It extends self-supervised learning to audio data, using a CNN for encoding and a BERT encoder for contextualization, enhanced by masked prediction and cluster refinement [<xref ref-type="bibr" rid="ref50">50</xref>]. HuBERT has demonstrated superior performance in audio classification tasks for detecting depression [<xref ref-type="bibr" rid="ref54">54</xref>] and assessing cognitive function [<xref ref-type="bibr" rid="ref55">55</xref>].</p></list-item></list><list list-type="bullet"><list-item><p>Longformer: It stands out as a transformer-based language model designed to capture extended dependencies using sliding window and global attention mechanisms [<xref ref-type="bibr" rid="ref51">51</xref>]. This design enables Longformer to effectively integrate local and global information while mitigating challenges associated with traditional attention mechanisms.</p></list-item></list><list list-type="bullet"><list-item><p>ERNIE-health: It is a Chinese biomedical language model tailored for biomedical text processing, enhancing tokenization and comprehension of biomedical content through in-domain text [<xref ref-type="bibr" rid="ref56">56</xref>]. 
ERNIE-health consistently outperforms other models across various biomedical tasks [<xref ref-type="bibr" rid="ref57">57</xref>], underscoring its effectiveness in this domain.</p></list-item></list><p>Second, these pretrained models were used to generate audio embeddings, <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and text embeddings, <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, by feeding the preprocessed audio recordings and transcripts as their inputs. To combine the information from text and audio modalities, we adopted the early fusion approach by concatenating the audio embedding (<inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>) and text embedding (<inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>) into a single vector (<inline-formula><mml:math id="ieqn5"><mml:mi>x</mml:mi></mml:math></inline-formula>), using Eq. (1). This fusion strategy, also known as feature-level fusion, is characterized by its simplicity, its computational efficiency, and the potential to capture intricate interactive details. We adopted this approach due to its aforementioned advantages and its ability to circumvent the risk of information overlap or cancellation inherent in more complex operations such as addition, subtraction, inner product, and outer product. 
This approach has been widely used in prior studies on audio-text fusion, consistently yielding improved accuracy [<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref60">60</xref>].</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2295;</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula><p>Lastly, our proposed framework adopted the hard parameter sharing scheme for MTL of DS and SR classification using deep learning. This scheme involves a shared encoder with multiple task-specific decoding heads [<xref ref-type="bibr" rid="ref46">46</xref>]. This MTL technique enables our framework to learn multiple related tasks simultaneously while improving the generalization performance. In our framework, an FC network acts as the shared encoder, and dense layers act as the task-specific heads. The FC network <inline-formula><mml:math id="ieqn6"><mml:mi>f</mml:mi></mml:math></inline-formula> learns a condensed representation <inline-formula><mml:math id="ieqn7"><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> from the fused input <inline-formula><mml:math id="ieqn8"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>x</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, as shown in Eq. (2). 
Subsequently, a softmax function is applied to 2 task-specific dense layers, <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn10"><mml:msub><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, to transform <inline-formula><mml:math id="ieqn11"><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">`</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> into output probabilities for DS classification (Eq. (3)) and SR classification (Eq. (4)), respectively.</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:mfenced></mml:math></disp-formula><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle 
displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In the context of an MTL model, the design of loss functions for multiple objectives is crucial. Instead of using weighted sum of loss functions, which can be influenced by weights and time-consuming to determine, we adopted the automatic weighted loss approach introduced by [<xref ref-type="bibr" rid="ref47">47</xref>]. This method considers the homoscedastic uncertainty of each task and derives appropriate weights based on task uncertainties. Tasks with higher uncertainties are assigned lower weights, allowing the model to effectively learn across tasks in a more balanced manner.</p><p>For the loss calculation, our proposed method involves a 2-stage approach. In the first stage, we compute task-specific losses, <inline-formula><mml:math id="ieqn12"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn13"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, for DS and SR tasks, respectively, using cross entropy, as described in Eq. (5) (<inline-formula><mml:math id="ieqn14"><mml:mi>C</mml:mi></mml:math></inline-formula> represents the number of labels in the corresponding task). 
In the second stage, the total loss, <inline-formula><mml:math id="ieqn15"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, is determined using the automatic weighted loss method proposed by [<xref ref-type="bibr" rid="ref47">47</xref>], as depicted in Eq. (6). The goal is to minimize the total loss, enhancing the model&#x2019;s performance in DS and SR classification tasks, which can enable effective learning from the data and accurate predictions for both tasks.</p><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E6"><label>(6)</label><mml:math 
id="eqn6"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="normal">log</mml:mi></mml:mrow><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfenced></mml:mrow></mml:mrow><mml:mo>+</mml:mo><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">g</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo>(</mml:mo><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:math></disp-formula></sec><sec id="s2-4"><title>Implementation Details</title><p>We implemented our approach using PyTorch [<xref ref-type="bibr" rid="ref61">61</xref>] and the Transformers library from Hugging Face [<xref 
ref-type="bibr" rid="ref62">62</xref>]. Pretrained models were loaded by specifying the model version string in the application programming interface. Refer to Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details of the Chinese versions of the 4 models selected for this study.</p><p>To extract features from the audio and text modalities, we configured several parameters. The audio features were generated with a sampling rate of 16,000 and a duration of 6.25 seconds, resulting in a 100,000-dimensional feature. For the text modality, transcripts were tokenized into a fixed length of 512 tokens, with truncation or padding applied if necessary. The audio features were then transformed into 1024-dimensional embeddings, while the tokenized text inputs were represented as 768-dimensional embeddings.</p><p>To prevent overfitting during training, batch normalization and rectified linear unit activation were applied to linear layers that did not act as classifiers. A batch size of 8 was used, and the models were trained for 20 epochs with an early stopping patience of 3. Cross-entropy was used to calculate the loss for single-task learning (STL), while automatic weighted loss was used for MTL. The AdamW optimizer was used for optimizing the losses. 
The parameter details for each model, including the modality used (single or multiple) and the learning architecture adopted (single task or multitask), are presented in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Parameter settings.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">SMSTL<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">MMSTL<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">SMMTL<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="bottom">MMMTL<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Epochs</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Early stopping patience</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">8</td><td align="left" valign="top">8</td><td align="left" valign="top">8</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">0.0005</td><td align="left" valign="top">0.0005</td><td align="left" valign="top">0.0005</td><td align="left" valign="top">0.0005</td></tr><tr><td align="left" valign="top">Warmup ratio</td><td align="left" valign="top">0.3</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.25</td></tr><tr><td align="left" valign="top">Dropout probability</td><td align="left" 
valign="top">0.2</td><td align="left" valign="top">0.1</td><td align="left" valign="top">0.1</td><td align="left" valign="top">0.1</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SMSTL: single modality with single-task learning.</p></fn><fn id="table4fn2"><p><sup>b</sup>MMSTL: multimodal with single-task learning.</p></fn><fn id="table4fn3"><p><sup>c</sup>SMMTL: single modality with multitask learning.</p></fn><fn id="table4fn4"><p><sup>d</sup>MMMTL: multimodal with multitask learning.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Experimental Evaluation</title><p>Our proposed framework is built using the 3 data types in the dataset: audio recordings, transcripts, and questionnaire results, as shown in the flow diagram in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><p>During preprocessing, the audio data underwent 3 steps: removal of file-edge silence, denoising using Podcastle [<xref ref-type="bibr" rid="ref63">63</xref>], and feature extraction. We used Podcastle&#x2019;s Magic Dust AI technology for its advanced denoising capabilities, which integrate spectral filtering, adaptive noise cancellation, and machine learning algorithms [<xref ref-type="bibr" rid="ref64">64</xref>]. Specifically, we used the &#x201C;noise reduction&#x201D; mode to automatically detect and suppress nonstationary background noises, such as coughs, sniffles, and microphone taps, while preserving speech clarity and signal integrity [<xref ref-type="bibr" rid="ref65">65</xref>]. This step minimized noise-related distortions prior to feature extraction and analysis.</p><p>Feature extraction was then applied to both audio and text data using pretrained models, yielding their respective embeddings as described earlier. The processed dataset was partitioned into 10 subsets for cross-validation, with 1 subset used for testing and the remaining 9 for training in each fold. Final performance metrics were averaged across all 10 trials. 
In parallel, questionnaire responses were one-hot encoded to represent discrete class labels, serving as the output variables for prediction.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Flow diagram. MLP: multilayer perceptron; MTL: multitask learning; STL: single-task learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66907_fig02.png"/></fig><p>Our study consisted of 3 experiments. In experiments 1 and 2, we built STL models for DS and SR tasks, using different combinations of embeddings. The primary aim was to identify the best pretrained models for extracting text and audio embeddings in each task and assess the advantages of using multimodal data compared to unimodal data for each task. In experiment 3, we developed several MTL models with hard parameter sharing to combine information from both tasks. The performance of these MTL models was then compared to the STL models from experiments 1 and 2, providing insights into the potential benefits of MTL for the 2 tasks.</p></sec><sec id="s2-6"><title>Performance Measure</title><p>To assess the effectiveness of our classification models, we used a range of standard metrics, including accuracy, recall, precision, specificity, <italic>F</italic><sub>1</sub>-score, and area under the curve (AUC). These metrics were derived from the confusion matrix, with AUC serving as the primary metric for comprehensive performance evaluation. In cases where the difference in AUC between models was not significant, we also considered other metrics, such as accuracy, <italic>F</italic><sub>1</sub>-score, and recall, to ensure a thorough assessment of model performance.</p><p>In the SR prediction task, the positive class (eg, &#x201C;at risk&#x201D;) encompassed individuals with a moderate or high risk of suicide, as detailed earlier. 
In contrast, for the DS prediction task, the models&#x2019; performance across all classes (eg, none, low/moderate, and high) was evaluated using the macro-average approach, rather than focusing solely on a specific positive class.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Experiment 1: STL Models for DS Prediction</title><p>In experiment 1, we aimed to find the best STL model for DS classification by using various pretrained embeddings to differentiate between the 3 severity levels. These models employed a multilayer perceptron classifier for classification and were categorized into audio-only, text-only, and combined audio and text modalities based on the embeddings used. The classification performance of these DS prediction models on each metric is presented in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Performance comparison of single-task learning models for depression severity (DS) and suicide risk (SR) prediction. Performance metrics are presented as a heatmap, where color gradients reflect the relative magnitude of values, ranging from red (lower values) to green (higher values). A: audio only; A+T: combined audio and text; ACC: accuracy; AUC: area under the curve; E: ERNIE-health; <italic>F</italic><sub>1</sub>: <italic>F</italic><sub>1</sub>-score; H: HuBERT; L: Longformer; P: precision; R: recall; S: specificity; T: text only; W: wav2vec 2.0.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66907_fig03.png"/></fig><p>Regarding DS prediction models, we obtained several findings (<xref ref-type="fig" rid="figure3">Figure 3</xref>). First, the results demonstrated that most embeddings, except those of the audio modality, performed well in terms of AUC, with scores exceeding 0.8. 
In the audio modality, the wav2vec 2.0 embedding outperformed the HuBERT embedding. In the text modality, the ERNIE-health embedding demonstrated superior performance in terms of AUC (0.877), accuracy (0.780), recall (0.609), and specificity (0.868), indicating its effectiveness in capturing specific aspects of DS in textual data. Second, combining embeddings from different modalities led to improvements across all metrics for most embeddings. Notably, the addition of the ERNIE-health embedding to the HuBERT embedding resulted in a substantial performance boost, with an 11.5% increase in AUC and up to 27.79% improvement in precision. Third, our comprehensive evaluation of multiple metrics showed that the multimodal models outperformed the single-modality models, except for the combination that included the HuBERT embedding, which may impair the ability of text embeddings. Lastly, among all the embeddings analyzed, the wav2vec 2.0+ERNIE-health and wav2vec 2.0+Longformer embeddings achieved the highest AUC scores of 0.878 and 0.873, respectively.</p></sec><sec id="s3-2"><title>Experiment 2: STL Models for SR Prediction</title><p>In experiment 2, our objective was to identify the best STL model for SR classification by using different pretrained embeddings. Similar to experiment 1, multilayer perceptron classifiers were used to analyze the embeddings from different modalities. The evaluation results of these embeddings for SR classification are presented in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><p>Based on the data presented in <xref ref-type="fig" rid="figure3">Figure 3</xref>, regarding SR prediction models, several findings were obtained. First, the results demonstrated that most embeddings achieved AUC values greater than 0.8, except for audio modality embeddings and the Longformer embedding. In the audio modality, the HuBERT embedding outperformed the wav2vec 2.0 embedding on most metrics, except for precision and specificity. 
This suggests that the HuBERT embedding may be a better choice for overall SR classification, while the wav2vec 2.0 embedding may be more effective in correctly identifying individuals who are not at risk of suicide. In the text modality, the ERNIE-health embedding outperformed the Longformer embedding, obtaining higher values on all metrics, indicating that the ERNIE-health embedding is more effective for SR classification. Second, combining embeddings from different modalities consistently improved AUC, precision, and specificity. Specifically, incorporating multimodal embeddings led to significant performance improvements, with increased AUC (2.28% to 15.60%), precision (3.32% to 20.81%), and specificity (2.73% to 10.19%) across all single-modality models, indicating improved accuracy in identifying nonrisk individuals. Third, the HuBERT+ERNIE-health embedding achieved the highest performance in terms of AUC (0.876) among all embeddings.</p></sec><sec id="s3-3"><title>Experiment 3: MTL Models for DS and SR Predictions</title><p>In experiment 3, we aimed to explore the potential of MTL models in improving DS and SR predictions by leveraging shared information between the 2 tasks. <xref ref-type="fig" rid="figure4">Figure 4</xref> provides a comprehensive summary of the performance metrics, and the subsequent content further discusses the results of experiments 1 and 2 for comparison.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Performance comparison of multitask learning models for depression severity (DS) and suicide risk (SR) prediction. Performance metrics are presented as a heatmap, where color gradients reflect the relative magnitude of values, ranging from red (lower values) to green (higher values). 
A: audio only; A+T: combined audio and text; ACC: accuracy; AUC: area under the curve; E: ERNIE-health; <italic>F</italic><sub>1</sub>: <italic>F</italic><sub>1</sub>-score; H: HuBERT; L: Longformer; P: precision; R: recall; S: specificity; T: text only; W: wav2vec 2.0.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66907_fig04.png"/></fig><p>From <xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref>, we found that all models, except the ones using the wav2vec 2.0+Longformer embedding, demonstrated an increase in AUC ranging from 0.25% to 3.88% with MTL, indicating the potential of MTL in enhancing performance for DS. Additionally, we observed that when adopting MTL for SR prediction, all models, except for the wav2vec 2.0+ERNIE-health and HuBERT+Longformer embeddings, demonstrated an increase in AUC ranging from 0.96% to 10.18%. On the other hand, what stands out is that when applying the MTL framework, there was a consistent enhancement in accuracy, <italic>F</italic><sub>1</sub>-score, and recall among the combined audio and text models, including the aforementioned 2 models using the wav2vec 2.0+ERNIE-health and HuBERT+Longformer embeddings. These findings suggest that combined audio and text embeddings are well-suited for the MTL approach, although they may increase false positives while better identifying individuals at risk for suicide.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study proposes a multitask framework that integrates a multimodal fusion strategy using pretrained audio and text embeddings to concurrently assess DS and SR. The efficacy of the proposed method has been validated using real-world clinical data.</p><p>Some of the significant findings of this study are as follows. 
First, we introduced and investigated renowned pretrained models for their effectiveness in audio and text classification tasks. The findings demonstrated that the ERNIE-health text modality embedding, specifically trained on a medical corpus, consistently outperformed the Longformer text modality embedding in both STL models (for DS prediction and SR prediction) and MTL models. On the other hand, the wav2vec 2.0 audio modality embedding performed better than the HuBERT embedding in STL models for DS prediction and MTL models for both tasks, but performed worse than the HuBERT embedding in STL models for SR prediction.</p><p>Second, our results underscore the effectiveness of multimodal approaches over single-modality ones in classifying DS and SR in the majority of cases. Even straightforward fusion techniques, such as concatenation, improve performance by integrating richer information, consistent with previous research [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref66">66</xref>]. This implies that the combination of audio and text embeddings provides a more comprehensive representation of the underlying phenomena than using each modality independently.</p><p>Third, the results indicated that the performance of text modality models significantly surpassed that of audio modality models, except in MTL models using the Longformer embedding for SR prediction. Several potential explanations can be considered for this observation. Despite preprocessing efforts to reduce noise, the audio modality model remains susceptible to variations in speaker accents or weaker emotional expressiveness [<xref ref-type="bibr" rid="ref67">67</xref>], which can adversely affect the model&#x2019;s performance. In contrast, text data are not influenced by such variations. 
Additionally, techniques for processing and embedding text data are highly advanced, such as ERNIE-health, which can contribute to the superior performance of most text modality models. This demonstrates that ERNIE-health can effectively bridge the gap between pretraining goals and downstream tasks [<xref ref-type="bibr" rid="ref56">56</xref>]. Conversely, processing and feature extraction for audio data in our dataset may not be as efficient as for text embeddings. Furthermore, research indicates that suicidal tendencies and depressive symptoms are explicitly conveyed through syntactic and semantic patterns in text, which are efficiently captured by text embeddings [<xref ref-type="bibr" rid="ref68">68</xref>]. On the contrary, extracting and interpreting these signals from audio data are inherently more complex and less robust.</p><p>Fourth, our findings demonstrated that the proposed MTL framework, using specific pretrained audio and text embeddings, significantly enhanced the classification performance for DS and SR. Considering the common class imbalance in clinical datasets, we selected AUC as our primary evaluation metric to provide a reliable and clinically meaningful representation of model performance [<xref ref-type="bibr" rid="ref69">69</xref>,<xref ref-type="bibr" rid="ref70">70</xref>]. However, our experiments also revealed that not all models benefited from its use, with 3 multimodal models showing no improvement in AUC (<xref ref-type="table" rid="table5">Table 5</xref>). Interestingly, MTL improved the performance of all single-modality models on both tasks. However, among the 4 multimodal models, only 1 (ie, using the HuBERT+ERNIE-health embeddings) exhibited improvement in AUC for both tasks when using MTL. The remaining 3 models demonstrated mixed results, with 1 task showing improvement, and the others experiencing a drop in performance. 
This highlights the phenomenon of negative transfer [<xref ref-type="bibr" rid="ref71">71</xref>], suggesting that transferred knowledge may not always have a positive impact on other tasks, even if they share similarities [<xref ref-type="bibr" rid="ref71">71</xref>].</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Performance comparison of STL<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> and MTL<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> models for depression severity and suicide risk prediction.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="3">Task, modality, and embedding</td><td align="left" valign="top" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">Improvement</td></tr><tr><td align="left" valign="bottom" colspan="3"/><td align="left" valign="top">STL</td><td align="left" valign="top">MTL</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">Depression severity prediction</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Audio only</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec</td><td align="left" valign="top">0.791</td><td align="left" valign="top">0.793</td><td align="left" valign="top">+0.002</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT</td><td align="left" valign="top">0.765</td><td align="left" valign="top">0.771</td><td align="left" 
valign="top">+0.006</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Text only</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Longformer</td><td align="left" valign="top">0.802</td><td align="left" valign="top">0.810</td><td align="left" valign="top">+0.008</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE-health</td><td align="left" valign="top">0.877</td><td align="left" valign="top">0.885</td><td align="left" valign="top">+0.008</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Combination of audio and text</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec+ERNIE-health</td><td align="left" valign="top">0.878</td><td align="left" valign="top">0.912<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">+0.034</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec+Longformer</td><td align="left" valign="top">0.873</td><td align="left" valign="top">0.866</td><td align="left" valign="top">&#x2212;0.007</td></tr><tr><td align="left" valign="top" colspan="3"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT+ERNIE-health</td><td align="left" valign="top">0.853</td><td align="left" valign="top">0.866</td><td align="left" valign="top">+0.013</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT+Longformer</td><td align="left" valign="top">0.820</td><td align="left" valign="top">0.844</td><td align="left" valign="top">+0.024</td></tr><tr><td align="left" valign="top" colspan="6">Suicide risk prediction</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Audio only</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec</td><td align="left" valign="top">0.737</td><td align="left" valign="top">0.812</td><td align="left" valign="top">+0.075</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT</td><td align="left" valign="top">0.762</td><td align="left" valign="top">0.803</td><td align="left" valign="top">+0.041</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Text only</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Longformer</td><td 
align="left" valign="top">0.784</td><td align="left" valign="top">0.799</td><td align="left" valign="top">+0.015</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE-health</td><td align="left" valign="top">0.833</td><td align="left" valign="top">0.861</td><td align="left" valign="top">+0.028</td></tr><tr><td align="left" valign="top" colspan="6"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Combination of audio and text</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec+ERNIE-health</td><td align="left" valign="top">0.852</td><td align="left" valign="top">0.829</td><td align="left" valign="top">&#x2212;0.023</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>wav2vec+Longformer</td><td align="left" valign="top">0.838</td><td align="left" valign="top">0.846</td><td align="left" valign="top">+0.008</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT+ERNIE-health</td><td align="left" valign="top">0.876</td><td align="left" valign="top">0.901<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">+0.025</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HuBERT+Longformer</td><td align="left" valign="top">0.822</td><td align="left" valign="top">0.821</td><td align="left" valign="top">&#x2212;0.001</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>STL: single-task learning.</p></fn><fn id="table5fn2"><p><sup>b</sup>MTL: multitask learning.</p></fn><fn id="table5fn3"><p><sup>c</sup>AUC: area under the curve.</p></fn><fn id="table5fn4"><p><sup>d</sup>Highest AUC values for each task.</p></fn></table-wrap-foot></table-wrap><p>Finally, our study revealed that MTL models led to more substantial improvements in the SR prediction task compared to the DS prediction task, with all multimodal MTL models demonstrating higher recall than their STL counterparts in predicting SR. These findings may be attributed to several aspects. On one hand, the SR prediction task may involve information or patterns different from those in the DS prediction task. For instance, text modalities might convey clearer linguistic patterns, such as specific word choices, pronoun usage, and negative terms [<xref ref-type="bibr" rid="ref17">17</xref>], which could be more predictive of SR than DS. However, MTL allows models to share learned representations across tasks. If the features relevant to the SR prediction task benefit from certain text or audio modality representations, these features may also aid the DS task, even if the latter shows less improvement. On the other hand, the prediction of DS may be more influenced by sample variability [<xref ref-type="bibr" rid="ref72">72</xref>,<xref ref-type="bibr" rid="ref73">73</xref>], whereas the prediction of SR might exhibit stronger commonalities across samples. 
These findings further underscore the value of MTL, as it enables the model to address such differences through shared representations, thereby enhancing prediction accuracy.</p><p>To contextualize our work within current state-of-the-art techniques, we compared our multitask framework with recent studies on depression and suicide prediction, as summarized in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Our proposed MTL model, which integrates audio and text modalities with pretrained embeddings, achieved competitive performance (DS: AUC=0.91; accuracy=0.81; <italic>F</italic><sub>1</sub>-score=0.69 with wav2vec 2.0+ERNIE-health; SR: AUC=0.90; accuracy=0.78; <italic>F</italic><sub>1</sub>-score=0.77 with HuBERT+ERNIE-health), outperforming several prominent MTL models. These include models by Benton et al [<xref ref-type="bibr" rid="ref25">25</xref>] (depression: AUC=0.77; suicide: AUC=0.83), Ghosh et al [<xref ref-type="bibr" rid="ref46">46</xref>] (depression: accuracy=0.74), and Yang et al [<xref ref-type="bibr" rid="ref27">27</xref>] (suicide: accuracy=0.74). While Buddhitha and Inkpen [<xref ref-type="bibr" rid="ref29">29</xref>] reported slightly higher performance for suicide prediction (AUC=0.88; accuracy=0.84), their approach relied on Reddit posts rather than clinical data.</p><p>Our study also outperformed all single-task depression prediction studies presented in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which predominantly used binary classification (ie, depressed vs nondepressed). In contrast, our multitask framework enabled a more nuanced assessment by explicitly predicting the severity of depressive symptoms rather than merely classifying their presence or absence. 
Although some single-task suicide prediction models reported higher metrics, including models by Chen et al [<xref ref-type="bibr" rid="ref38">38</xref>] (<italic>F</italic><sub>1</sub>-score=0.76), Tsui et al [<xref ref-type="bibr" rid="ref39">39</xref>] (AUC=0.93), and Bouktif et al [<xref ref-type="bibr" rid="ref36">36</xref>] (accuracy=0.94), they used substantially larger datasets (1284 subjects, 45,238 patients, and 348,110 posts, respectively) and focused exclusively on single-task prediction. Similarly, Ram&#x00ED;rez-Cifuentes et al [<xref ref-type="bibr" rid="ref40">40</xref>] achieved an AUC of 0.94 for suicide prediction using social media data, which suffered from known limitations, including self-presentation biases, language ambiguities, and an inability to detect offline SR [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>Our study uniquely applied MTL to simultaneously predict DS and SR using multimodal data from clinical interviews. Unlike prior work that focused on single tasks or unimodal inputs, often derived from electronic health records or social media, our approach captured direct clinical interactions, yielding more authentic behavioral signals. Comparative analyses demonstrated that our model effectively predicted both DS and SR, offering clear advantages over existing methods for this clinically important objective.</p></sec><sec id="s4-2"><title>Theoretical Implications</title><p>This study makes substantial contributions to existing literature from 2 main perspectives. First, this study delineated the efficacy of integrating MML, MTL, and TL in simultaneously identifying DS and SR, thereby advancing the understanding of depression and suicide detection. While existing research, such as [<xref ref-type="bibr" rid="ref25">25</xref>], has explored the impact and importance of MTL in DS and SR prediction, studies have predominantly focused on social media contexts. 
Limited research has evaluated the effectiveness of MTL in clinical settings. This study addressed this gap through empirical experiments using real-world clinical datasets, demonstrating that the proposed multimodal multitask approach, integrating pretrained embeddings, is applicable to clinical settings.</p><p>Furthermore, our findings underscore that MTL generally enhances model performance, consistent with prior literature (eg, [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]), highlighting the benefits of knowledge sharing across domains [<xref ref-type="bibr" rid="ref25">25</xref>]. However, our experiments also revealed instances of negative transfer [<xref ref-type="bibr" rid="ref71">71</xref>], emphasizing the importance of selecting optimal MTL strategies based on embeddings, tasks, and application scenarios. Moreover, further thoughtful evaluation should consider balancing the costs associated with false positives and false negatives, using more comprehensive metrics.</p><p>Second, we discussed and presented a comparison of popular pretrained models (Longformer and ERNIE-health for text modality, and wav2vec 2.0 and HuBERT for audio modality) to evaluate their effectiveness with clinical data, providing a valuable addition to the existing literature on depression and suicide prediction research. Our findings revealed that ERNIE-health outperformed Longformer in text modality embedding, and wav2vec 2.0 generally surpassed HuBERT, although there were instances where the reverse was true. This underscores the necessity of judicious pretrained model selection and thorough testing for clinical applicability in the future. 
Nevertheless, we still affirm the efficacy of TL, as even single-task and single-modality models exhibited commendable performance, although our dataset included only 200 samples.</p></sec><sec id="s4-3"><title>Practical Implications</title><p>This study has several important practical implications. First, the persistent challenge of data scarcity has limited progress in both academic research and clinical practice. Through techniques like MML, TL, and MTL, we propose promising solutions. Second, by integrating multimodal data from speech and text and applying TL methods, our approach can facilitate clinical diagnosis with objective and quantitative measurements. This enables a rapid, efficient, and cost-effective assessment of DS and SR based solely on patients&#x2019; verbal disclosures to health care providers. Third, the effectiveness of our method suggests a promising avenue for automated SR detection through the development of innovative tools, thereby making a significant contribution to early suicide prevention efforts.</p></sec><sec id="s4-4"><title>Limitations and Future Research</title><p>This study has certain limitations that warrant further research. First, our dataset of 200 participants (100 patients with depression and 100 healthy individuals) represents a significant limitation that severely constrains the generalizability of our findings to broader populations. Despite implementing cross-validation techniques, this small sample size introduces considerable risks of overfitting, where the model may capture dataset-specific characteristics rather than robust, generalizable patterns for DS and SR detection. This limitation necessitates external validation with larger, more diverse cohorts from different clinical settings and demographic backgrounds to establish the true clinical utility and robustness of our proposed method. 
The incorporation of larger external datasets is therefore essential to not only enhance robustness but also refine and validate our approach across varied populations.</p><p>Furthermore, addressing data imbalance has emerged as a critical challenge in accurately identifying and classifying depression cases across varying severity levels. Our comprehensive analysis revealed significant performance disparities among &#x201C;none,&#x201D; &#x201C;low/moderate,&#x201D; and &#x201C;high&#x201D; severity subcategories, with particularly pronounced difficulties in classifying &#x201C;low/moderate&#x201D; severity cases (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This variability underscores the intricate complexity of developing a robust diagnostic approach capable of consistently discerning nuanced variations in DS. Future research should, therefore, focus on advancing MTL strategies that integrate multimodal feature representations with targeted sampling techniques and refined weighting mechanisms to enhance the robustness of model predictive performance across varying severity levels of depression.</p><p>Furthermore, the exploration of diverse fusion strategies and weight adjustments in MTL, along with the investigation of various pretrained models, warrants further investigation to potentially enhance model performance in future studies. However, while our implementation was straightforward, our primary objective was to develop a computationally efficient and effective method that prioritizes resource efficiency. Finally, exploring the applicability of these techniques to a broader spectrum of mental health disorders is essential. 
This includes leveraging MML and MTL approaches to integrate information across different disorders, thereby expanding the scope of potential applications in mental health diagnostics.</p></sec><sec id="s4-5"><title>Conclusion</title><p>Early detection and accurate diagnosis are crucial for implementing timely interventions and alleviating the societal and economic burdens associated with mental health conditions. This study proposes an effective approach to improving model performance by integrating MTL, MML, and TL for concurrent depression and suicide detection. Our empirical findings, obtained by fine-tuning MTL models on clinical datasets, provide compelling evidence for the effectiveness of integrating MTL, MML, and TL methods in addressing mental health tasks. However, we advocate for cautious MTL implementation to mitigate potential negative transfer effects. Additionally, we recommend careful consideration for the selection of pretrained models and rigorous validation to ensure their clinical applicability. 
Our proposed methods offer a promising pathway for future research and clinical applications in mental health diagnostics.</p></sec></sec></body><back><ack><p>This research was supported in part by the Ministry of Science and Technology (grant numbers: MOST 110-2314-B-367-001-, MOST 111-2410-H-008-026-MY2, and MOST 111-2314-B-367-001-MY3).</p></ack><notes><sec><title>Data Availability</title><p>The complete dataset and complete code will be made available on the Open Science Framework platform as soon as the work gets accepted for publication.</p></sec></notes><fn-group><fn fn-type="con"><p>CCS (pures1000@yahoo.com.tw) and ILL (05528@cych.org.tw) are co-corresponding authors for this article.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb3">DS</term><def><p>depression severity</p></def></def-item><def-item><term id="abb4">FC</term><def><p>fully connected</p></def></def-item><def-item><term id="abb5">HAMD-17</term><def><p>Hamilton Depression Rating Scale-17</p></def></def-item><def-item><term id="abb6">MML</term><def><p>multimodal learning</p></def></def-item><def-item><term id="abb7">MTL</term><def><p>multitask learning</p></def></def-item><def-item><term id="abb8">SR</term><def><p>suicide risk</p></def></def-item><def-item><term id="abb9">STL</term><def><p>single-task learning</p></def></def-item><def-item><term id="abb10">TL</term><def><p>transfer learning</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Depressive disorder (depression)</article-title><source>World Health Organization (WHO)</source><access-date>2025-05-20</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.who.int/news-room/fact-sheets/detail/depression">https://www.who.int/news-room/fact-sheets/detail/depression</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burcusa</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Iacono</surname><given-names>WG</given-names> </name></person-group><article-title>Risk for recurrence in depression</article-title><source>Clin Psychol Rev</source><year>2007</year><month>12</month><volume>27</volume><issue>8</issue><fpage>959</fpage><lpage>985</lpage><pub-id pub-id-type="doi">10.1016/j.cpr.2007.02.005</pub-id><pub-id pub-id-type="medline">17448579</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>L&#x00E9;pine</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Briley</surname><given-names>M</given-names> </name></person-group><article-title>The increasing burden of depression</article-title><source>Neuropsychiatr Dis Treat</source><year>2011</year><volume>7</volume><issue>Suppl 1</issue><fpage>3</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.2147/NDT.S19617</pub-id><pub-id pub-id-type="medline">21750622</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laget</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sofia</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bolognini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Plancherel</surname><given-names>B</given-names> </name><name name-style="western"><surname>Halfon</surname><given-names>O</given-names> </name><name 
name-style="western"><surname>St&#x00E9;phan</surname><given-names>P</given-names> </name></person-group><article-title>Use of a multidimensional assessment tool in a psychiatric adolescent care unit</article-title><source>J Eval Clin Pract</source><year>2006</year><month>10</month><volume>12</volume><issue>5</issue><fpage>549</fpage><lpage>558</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2753.2006.00669.x</pub-id><pub-id pub-id-type="medline">16987117</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thagard</surname><given-names>P</given-names> </name><name name-style="western"><surname>Larocque</surname><given-names>L</given-names> </name></person-group><article-title>Mental health assessment: inference, explanation, and coherence</article-title><source>J Eval Clin Pract</source><year>2018</year><month>06</month><volume>24</volume><issue>3</issue><fpage>649</fpage><lpage>654</lpage><pub-id pub-id-type="doi">10.1111/jep.12885</pub-id><pub-id pub-id-type="medline">29380474</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tramonti</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ferrante</surname><given-names>B</given-names> </name><name name-style="western"><surname>Palmer</surname><given-names>H</given-names> </name></person-group><article-title>A consulting room with a view: psychotherapy and the ecological context</article-title><source>J Eval Clin Pract</source><year>2024</year><month>09</month><volume>30</volume><issue>6</issue><fpage>1113</fpage><lpage>1122</lpage><pub-id pub-id-type="doi">10.1111/jep.14030</pub-id><pub-id pub-id-type="medline">38818691</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Low</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>SS</given-names> </name></person-group><article-title>Automated assessment of psychiatric disorders using speech: a systematic review</article-title><source>Laryngoscope Investig Otolaryngol</source><year>2020</year><month>02</month><volume>5</volume><issue>1</issue><fpage>96</fpage><lpage>116</lpage><pub-id pub-id-type="doi">10.1002/lio2.354</pub-id><pub-id pub-id-type="medline">32128436</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Chehil</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kutcher</surname><given-names>S</given-names> </name></person-group><source>Suicide Risk Management: A Manual for Health Professionals</source><year>2012</year><publisher-name>John Wiley &#x0026; Sons</publisher-name><pub-id pub-id-type="doi">10.1002/9781119953128</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Connor</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Perdue</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Coppola</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Henninger</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Gaynes</surname><given-names>BN</given-names> </name></person-group><article-title>Depression and suicide risk screening: updated evidence report and systematic review for the US 
Preventive Services Task Force</article-title><source>JAMA</source><year>2023</year><month>06</month><day>20</day><volume>329</volume><issue>23</issue><fpage>2068</fpage><lpage>2085</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.7787</pub-id><pub-id pub-id-type="medline">37338873</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Richards</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Whiteside</surname><given-names>U</given-names> </name><name name-style="western"><surname>Ludman</surname><given-names>EJ</given-names> </name><etal/></person-group><article-title>Understanding why patients may not report suicidal ideation at a health care visit prior to a suicide attempt: a qualitative study</article-title><source>Psychiatr Serv</source><year>2019</year><month>01</month><day>1</day><volume>70</volume><issue>1</issue><fpage>40</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1176/appi.ps.201800342</pub-id><pub-id pub-id-type="medline">30453860</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iyer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Nedeljkovic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>D</given-names> </name></person-group><article-title>Using voice biomarkers to classify suicide risk in adult telehealth callers: retrospective observational study</article-title><source>JMIR Ment Health</source><year>2022</year><month>08</month><day>15</day><volume>9</volume><issue>8</issue><fpage>e39807</fpage><pub-id pub-id-type="doi">10.2196/39807</pub-id><pub-id pub-id-type="medline">35969444</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>SH</given-names> </name><name name-style="western"><surname>LePendu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Iyer</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Tai-Seale</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carrell</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name></person-group><article-title>Toward personalizing treatment for depression: predicting diagnosis and severity</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>6</issue><fpage>1069</fpage><lpage>1075</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2014-002733</pub-id><pub-id pub-id-type="medline">24988898</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nadif</surname><given-names>M</given-names> </name><name name-style="western"><surname>Role</surname><given-names>F</given-names> </name></person-group><article-title>Unsupervised and self-supervised deep learning approaches for biomedical text mining</article-title><source>Brief Bioinform</source><year>2021</year><month>03</month><day>22</day><volume>22</volume><issue>2</issue><fpage>1592</fpage><lpage>1603</lpage><pub-id pub-id-type="doi">10.1093/bib/bbab016</pub-id><pub-id pub-id-type="medline">33569575</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lou</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Guo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kwok</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>C</given-names> </name></person-group><article-title>EHR-HGCN: an enhanced hybrid approach for text classification using heterogeneous graph convolutional networks in electronic health records</article-title><source>IEEE J Biomed Health Inform</source><year>2023</year><volume>28</volume><issue>3</issue><fpage>1668</fpage><lpage>1679</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2023.3346210</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rejaibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Komaty</surname><given-names>A</given-names> </name><name name-style="western"><surname>Meriaudeau</surname><given-names>F</given-names> </name><name name-style="western"><surname>Agrebi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Othmani</surname><given-names>A</given-names> </name></person-group><article-title>MFCC-based recurrent neural network for automatic clinical depression recognition and assessment from speech</article-title><source>Biomed Signal Process Control</source><year>2022</year><month>01</month><volume>71</volume><fpage>103107</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2021.103107</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cummins</surname><given-names>N</given-names> </name><name name-style="western"><surname>Scherer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krajewski</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Schnieder</surname><given-names>S</given-names> </name><name name-style="western"><surname>Epps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Quatieri</surname><given-names>TF</given-names> </name></person-group><article-title>A review of depression and suicide risk assessment using speech analysis</article-title><source>Speech Commun</source><year>2015</year><month>07</month><volume>71</volume><fpage>10</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.specom.2015.03.004</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Homan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gabi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Klee</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Linguistic features of suicidal thoughts and behaviors: a systematic review</article-title><source>Clin Psychol Rev</source><year>2022</year><month>07</month><volume>95</volume><fpage>102161</fpage><pub-id pub-id-type="doi">10.1016/j.cpr.2022.102161</pub-id><pub-id pub-id-type="medline">35636131</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name></person-group><article-title>Towards automatic depression detection: a BiLSTM/1D CNN-based model</article-title><source>Appl Sci (Basel)</source><year>2020</year><volume>10</volume><issue>23</issue><fpage>8701</fpage><pub-id 
pub-id-type="doi">10.3390/app10238701</pub-id><pub-id pub-id-type="medline">33520293</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Park</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>HC</given-names> </name></person-group><article-title>Automatic depression detection using smartphone-based text-dependent speech signals: deep convolutional neural network approach</article-title><source>J Med Internet Res</source><year>2023</year><month>01</month><day>25</day><volume>25</volume><fpage>e34474</fpage><pub-id pub-id-type="doi">10.2196/34474</pub-id><pub-id pub-id-type="medline">36696160</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bennett-Poynter</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kundurthi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Besa</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Harnessing digital health data for suicide prevention and care: a rapid review</article-title><source>Digit Health</source><year>2025</year><volume>11</volume><fpage>20552076241308615</fpage><pub-id pub-id-type="doi">10.1177/20552076241308615</pub-id><pub-id pub-id-type="medline">39996066</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Kirtley</surname><given-names>OJ</given-names> </name><name name-style="western"><surname>van Mens</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hoogendoorn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kapur</surname><given-names>N</given-names> </name><name name-style="western"><surname>de Beurs</surname><given-names>D</given-names> </name></person-group><article-title>Translating promise into practice: a review of machine learning in suicide research and prevention</article-title><source>Lancet Psychiatry</source><year>2022</year><month>03</month><volume>9</volume><issue>3</issue><fpage>243</fpage><lpage>252</lpage><pub-id pub-id-type="doi">10.1016/S2215-0366(21)00254-6</pub-id><pub-id pub-id-type="medline">35183281</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nock</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>I</given-names> </name><name name-style="western"><surname>Sampson</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Kessler</surname><given-names>RC</given-names> </name></person-group><article-title>Mental disorders, comorbidity and suicidal behavior: results from the National Comorbidity Survey Replication</article-title><source>Mol Psychiatry</source><year>2010</year><month>08</month><volume>15</volume><issue>8</issue><fpage>868</fpage><lpage>876</lpage><pub-id pub-id-type="doi">10.1038/mp.2009.29</pub-id><pub-id pub-id-type="medline">19337207</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Heeringen</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Mann</surname><given-names>JJ</given-names> </name></person-group><article-title>The neurobiology of suicide</article-title><source>Lancet Psychiatry</source><year>2014</year><month>06</month><volume>1</volume><issue>1</issue><fpage>63</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1016/S2215-0366(14)70220-2</pub-id><pub-id pub-id-type="medline">26360403</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hawton</surname><given-names>K</given-names> </name><name name-style="western"><surname>Saunders</surname><given-names>KE</given-names> </name><name name-style="western"><surname>O&#x2019;Connor</surname><given-names>RC</given-names> </name></person-group><article-title>Self-harm and suicide in adolescents</article-title><source>The Lancet</source><year>2012</year><month>06</month><volume>379</volume><issue>9834</issue><fpage>2373</fpage><lpage>2382</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(12)60322-5</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Benton</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hovy</surname><given-names>D</given-names> </name></person-group><article-title>Multi-task learning for mental health using social media text</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 10, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1712.03538</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Teng</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Chai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tateyama</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YW</given-names> </name></person-group><article-title>Multi-modal and multi-task depression detection with sentiment assistance</article-title><conf-name>2024 IEEE International Conference on Consumer Electronics (ICCE)</conf-name><conf-date>Jan 6-8, 2024</conf-date><conf-loc>Las Vegas, NV, USA</conf-loc><pub-id pub-id-type="doi">10.1109/ICCE59016.2024.10444213</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>Z</given-names> </name></person-group><article-title>Time perspective-enhanced suicidal ideation detection using multi-task learning</article-title><source>IJNDI</source><year>2024</year><volume>3</volume><issue>2</issue><fpage>100011</fpage><pub-id pub-id-type="doi">10.53941/ijndi.2024.100011</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qureshi</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Dias</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hasanuzzaman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name></person-group><article-title>Improving depression level 
estimation by concurrently learning emotion intensity</article-title><source>IEEE Comput Intell Mag</source><year>2020</year><volume>15</volume><issue>3</issue><fpage>47</fpage><lpage>59</lpage><pub-id pub-id-type="doi">10.1109/MCI.2020.2998234</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buddhitha</surname><given-names>P</given-names> </name><name name-style="western"><surname>Inkpen</surname><given-names>D</given-names> </name></person-group><article-title>Multi-task learning to detect suicide ideation and mental disorders among social media users</article-title><source>Front Res Metr Anal</source><year>2023</year><volume>8</volume><fpage>1152535</fpage><pub-id pub-id-type="doi">10.3389/frma.2023.1152535</pub-id><pub-id pub-id-type="medline">37138946</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ophir</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tikochinski</surname><given-names>R</given-names> </name><name name-style="western"><surname>Asterhan</surname><given-names>CSC</given-names> </name><name name-style="western"><surname>Sisso</surname><given-names>I</given-names> </name><name name-style="western"><surname>Reichart</surname><given-names>R</given-names> </name></person-group><article-title>Deep neural networks detect suicide risk from textual facebook posts</article-title><source>Sci Rep</source><year>2020</year><month>10</month><day>7</day><volume>10</volume><issue>1</issue><fpage>16685</fpage><pub-id pub-id-type="doi">10.1038/s41598-020-73917-0</pub-id><pub-id pub-id-type="medline">33028921</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Qureshi</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hasanuzzaman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dias</surname><given-names>G</given-names> </name></person-group><article-title>Multitask representation learning for multimodal estimation of depression level</article-title><source>IEEE Intell Syst</source><year>2019</year><volume>34</volume><issue>5</issue><fpage>45</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1109/MIS.2019.2925204</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Boigne</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liyanage</surname><given-names>B</given-names> </name><name name-style="western"><surname>&#x00D6;strem</surname><given-names>T</given-names> </name></person-group><article-title>Recognizing more emotions with less data using self-supervised transfer learning</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 11, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2011.05585</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Palanisamy</surname><given-names>K</given-names> </name><name name-style="western"><surname>Singhania</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>A</given-names> </name></person-group><article-title>Rethinking CNN models for audio classification</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 22, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2007.11154</pub-id></nlm-citation></ref><ref 
id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nykoniuk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Basystiuk</surname><given-names>O</given-names> </name><name name-style="western"><surname>Shakhovska</surname><given-names>N</given-names> </name><name name-style="western"><surname>Melnykova</surname><given-names>N</given-names> </name></person-group><article-title>Multimodal data fusion for depression detection approach</article-title><source>Computation</source><year>2025</year><volume>13</volume><issue>1</issue><fpage>9</fpage><pub-id pub-id-type="doi">10.3390/computation13010009</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Philip Thekkekara</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yongchareon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liesaputra</surname><given-names>V</given-names> </name></person-group><article-title>An attention-based CNN-BiLSTM model for depression detection on social media text</article-title><source>Expert Syst Appl</source><year>2024</year><month>09</month><volume>249</volume><fpage>123834</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2024.123834</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bouktif</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khanday</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ouni</surname><given-names>A</given-names> </name></person-group><article-title>Explainable predictive model for suicidal ideation during COVID-19: social media discourse study</article-title><source>J Med 
Internet Res</source><year>2025</year><month>01</month><day>17</day><volume>27</volume><fpage>e65434</fpage><pub-id pub-id-type="doi">10.2196/65434</pub-id><pub-id pub-id-type="medline">39823631</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>X</given-names> </name><name name-style="western"><surname>Su</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name></person-group><article-title>Acoustic features for identifying suicide risk in crisis hotline callers: machine learning approach</article-title><source>J Med Internet Res</source><year>2025</year><month>04</month><day>14</day><volume>27</volume><fpage>e67772</fpage><pub-id pub-id-type="doi">10.2196/67772</pub-id><pub-id pub-id-type="medline">40228243</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Song</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>G</given-names> </name></person-group><article-title>Deep learning and large language models for audio and text analysis in predicting suicidal acts in Chinese 
psychological support hotlines</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 10, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.06164</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsui</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ruiz</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Natural language processing and machine learning of electronic health records for prediction of first-time suicide attempts</article-title><source>JAMIA Open</source><year>2021</year><month>01</month><volume>4</volume><issue>1</issue><fpage>ooab011</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooab011</pub-id><pub-id pub-id-type="medline">33758800</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ram&#x00ED;rez-Cifuentes</surname><given-names>D</given-names> </name><name name-style="western"><surname>Freire</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baeza-Yates</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Detection of suicidal ideation on social media: multimodal, relational, and behavioral analysis</article-title><source>J Med Internet Res</source><year>2020</year><month>07</month><day>7</day><volume>22</volume><issue>7</issue><fpage>e17758</fpage><pub-id pub-id-type="doi">10.2196/17758</pub-id><pub-id pub-id-type="medline">32673256</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Crawshaw</surname><given-names>M</given-names> 
</name></person-group><article-title>Multi-task learning with deep neural networks: a survey</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2009.09796</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Q</given-names> </name></person-group><article-title>A survey on multi-task learning</article-title><source>IEEE Trans Knowl Data Eng</source><year>2021</year><volume>34</volume><issue>12</issue><fpage>5586</fpage><lpage>5609</lpage><pub-id pub-id-type="doi">10.1109/TKDE.2021.3070203</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thung</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Wee</surname><given-names>CY</given-names> </name></person-group><article-title>A brief review on multi-task learning</article-title><source>Multimed Tools Appl</source><year>2018</year><month>11</month><volume>77</volume><issue>22</issue><fpage>29705</fpage><lpage>29725</lpage><pub-id pub-id-type="doi">10.1007/s11042-018-6463-x</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dumpala</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Rempel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dikaios</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sajjadian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Uher</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Oore</surname><given-names>S</given-names> </name></person-group><article-title>Estimating severity of depression from acoustic features and embeddings of natural speech</article-title><conf-name>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-date>Jun 6-11, 2021</conf-date><conf-loc>Toronto, ON, Canada</conf-loc><pub-id pub-id-type="doi">10.1109/ICASSP39728.2021.9414129</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Li</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Fine-grained depression analysis based on Chinese micro-blog reviews</article-title><source>Inf Process Manag</source><year>2021</year><month>11</month><volume>58</volume><issue>6</issue><fpage>102681</fpage><pub-id pub-id-type="doi">10.1016/j.ipm.2021.102681</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ekbal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bhattacharyya</surname><given-names>P</given-names> </name></person-group><article-title>A multitask framework to detect depression, sentiment and multi-label emotion from suicide notes</article-title><source>Cogn Comput</source><year>2022</year><month>01</month><volume>14</volume><issue>1</issue><fpage>110</fpage><lpage>129</lpage><pub-id pub-id-type="doi">10.1007/s12559-021-09828-7</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>M</given-names> </name></person-group><article-title>A rating scale for depression</article-title><source>J Neurol Neurosurg Psychiatry</source><year>1960</year><month>02</month><volume>23</volume><issue>1</issue><fpage>56</fpage><lpage>62</lpage><pub-id pub-id-type="doi">10.1136/jnnp.23.1.56</pub-id><pub-id pub-id-type="medline">14399272</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patterson</surname><given-names>WM</given-names> </name><name name-style="western"><surname>Dohn</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Bird</surname><given-names>J</given-names> </name><name name-style="western"><surname>Patterson</surname><given-names>GA</given-names> </name></person-group><article-title>Evaluation of suicidal patients: the SAD PERSONS scale</article-title><source>Psychosomatics</source><year>1983</year><month>04</month><volume>24</volume><issue>4</issue><fpage>343</fpage><lpage>345</lpage><pub-id pub-id-type="doi">10.1016/S0033-3182(83)73213-5</pub-id><pub-id pub-id-type="medline">6867245</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baevski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Auli</surname><given-names>M</given-names> </name></person-group><article-title>wav2vec 2.0: a framework for self-supervised learning of speech representations</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 20, 
2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2006.11477</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hsu</surname><given-names>WN</given-names> </name><name name-style="western"><surname>Bolte</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>YHH</given-names> </name><name name-style="western"><surname>Lakhotia</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A</given-names> </name></person-group><article-title>HuBERT: self-supervised speech representation learning by masked prediction of hidden units</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2021</year><volume>29</volume><fpage>3451</fpage><lpage>3460</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2021.3122291</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Longformer: the long-document transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.05150</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> 
</name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Depression recognition using voice-based pre-training model</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>12734</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-63556-0</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Song</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Fine-grained speech sentiment analysis in Chinese psychological support hotlines based on large-scale pre-trained model</article-title><source>arXiv</source><comment>Preprint posted online on  May 7, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.04128</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dumpala</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Dikaios</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nunes</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rudzicz</surname><given-names>F</given-names> </name><name name-style="western"><surname>Uher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Oore</surname><given-names>S</given-names> </name></person-group><article-title>Self-supervised embeddings for detecting individual symptoms of depression</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 25, 2024</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2406.17229</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kurtz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Driesse</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Early detection of cognitive decline using voice assistant commands</article-title><source>Proc IEEE Int Conf Acoust Speech Signal Process</source><year>2023</year><month>06</month><volume>2023</volume><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1109/icassp49357.2023.10095825</pub-id><pub-id pub-id-type="medline">40963950</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Building Chinese biomedical language models via multi-level text discrimination</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 14, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2110.07244</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Cao</surname><given-names>Y</given-names> </name></person-group><article-title>MMBERT: a unified framework for biomedical named entity recognition</article-title><source>Med Biol Eng Comput</source><year>2024</year><month>01</month><volume>62</volume><issue>1</issue><fpage>327</fpage><lpage>341</lpage><pub-id pub-id-type="doi">10.1007/s11517-023-02934-8</pub-id><pub-id pub-id-type="medline">37833517</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atmaja</surname><given-names>BT</given-names> </name><name name-style="western"><surname>Sasou</surname><given-names>A</given-names> </name><name name-style="western"><surname>Akagi</surname><given-names>M</given-names> </name></person-group><article-title>Survey on bimodal speech emotion recognition from acoustic and linguistic information fusion</article-title><source>Speech Commun</source><year>2022</year><month>05</month><volume>140</volume><fpage>11</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1016/j.specom.2022.03.002</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Pareek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Seyyedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lungren</surname><given-names>MP</given-names> </name></person-group><article-title>Fusion of medical imaging and electronic health records using deep learning: a systematic review and implementation guidelines</article-title><source>NPJ Digit 
Med</source><year>2020</year><volume>3</volume><issue>1</issue><fpage>136</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00341-z</pub-id><pub-id pub-id-type="medline">33083571</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ross</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Li</surname><given-names>SZ</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>A</given-names> </name></person-group><article-title>Fusion, feature-level</article-title><source>Encyclopedia of Biometrics</source><year>2009</year><publisher-name>Springer</publisher-name><fpage>597</fpage><lpage>602</lpage><pub-id pub-id-type="doi">10.1007/978-0-387-73003-5_157</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="web"><source>PyTorch</source><access-date>2025-10-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pytorch.org/">https://pytorch.org/</ext-link></comment></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="web"><source>Hugging Face</source><access-date>2025-10-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/">https://huggingface.co/</ext-link></comment></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="web"><source>Podcastle</source><access-date>2025-10-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://podcastle.ai/">https://podcastle.ai/</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="web"><article-title>8 best free AI noise reduction tools for crystal clear audio in 2025</article-title><source>AIMojo</source><access-date>2025-05-20</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://aimojo.io/free-ai-noise-reduction-tools/">https://aimojo.io/free-ai-noise-reduction-tools/</ext-link></comment></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="web"><article-title>What is noise reduction and how do I use it?</article-title><source>Podcastle</source><access-date>2025-05-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://help.podcastle.ai/en/articles/8912022-what-is-noise-reduction-and-how-do-i-use-it">https://help.podcastle.ai/en/articles/8912022-what-is-noise-reduction-and-how-do-i-use-it</ext-link></comment></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rodrigues Makiuchi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Warnita</surname><given-names>T</given-names> </name><name name-style="western"><surname>Uto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shinoda</surname><given-names>K</given-names> </name></person-group><article-title>Multimodal fusion of BERT-CNN and gated CNN representations for depression detection</article-title><conf-name>9th International on Audio/Visual Emotion Challenge and Workshop</conf-name><conf-date>Oct 21, 2019</conf-date><conf-loc>Nice, France</conf-loc><pub-id pub-id-type="doi">10.1145/3347320.3357694</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><etal/></person-group><article-title>METTS: multilingual emotional text-to-speech by cross-speaker and cross-lingual emotion 
transfer</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2024</year><volume>32</volume><fpage>1506</fpage><lpage>1518</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2024.3363444</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kalyan</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Sangeetha</surname><given-names>S</given-names> </name></person-group><article-title>SECNLP: A survey of embeddings in clinical natural language processing</article-title><source>J Biomed Inform</source><year>2020</year><month>01</month><volume>101</volume><fpage>103323</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103323</pub-id><pub-id pub-id-type="medline">31711972</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fawcett</surname><given-names>T</given-names> </name></person-group><article-title>An introduction to ROC analysis</article-title><source>Pattern Recognit Lett</source><year>2006</year><month>06</month><volume>27</volume><issue>8</issue><fpage>861</fpage><lpage>874</lpage><pub-id pub-id-type="doi">10.1016/j.patrec.2005.10.010</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rehmsmeier</surname><given-names>M</given-names> </name></person-group><article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title><source>PLoS ONE</source><year>2015</year><volume>10</volume><issue>3</issue><fpage>e0118432</fpage><pub-id 
pub-id-type="doi">10.1371/journal.pone.0118432</pub-id><pub-id pub-id-type="medline">25738806</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhuang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>K</given-names> </name><etal/></person-group><article-title>A comprehensive survey on transfer learning</article-title><source>Proc IEEE</source><year>2020</year><volume>109</volume><issue>1</issue><fpage>43</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1109/JPROC.2020.3004555</pub-id></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cusack</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Ralph-Nearman</surname><given-names>C</given-names> </name><name name-style="western"><surname>Christian</surname><given-names>C</given-names> </name><name name-style="western"><surname>Fisher</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Levinson</surname><given-names>CA</given-names> </name></person-group><article-title>Understanding heterogeneity, comorbidity, and variability in depression: Idiographic models and depression outcomes</article-title><source>J Affect Disord</source><year>2024</year><month>07</month><day>1</day><volume>356</volume><fpage>248</fpage><lpage>256</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2024.04.034</pub-id><pub-id pub-id-type="medline">38608769</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Melhem</surname><given-names>NM</given-names> </name><name 
name-style="western"><surname>Porta</surname><given-names>G</given-names> </name><name name-style="western"><surname>Oquendo</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Severity and variability of depression symptoms predicting suicide attempt in high-risk individuals</article-title><source>JAMA Psychiatry</source><year>2019</year><month>06</month><day>1</day><volume>76</volume><issue>6</issue><fpage>603</fpage><lpage>613</lpage><pub-id pub-id-type="doi">10.1001/jamapsychiatry.2018.4513</pub-id><pub-id pub-id-type="medline">30810713</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional data to support the findings of the study.</p><media xlink:href="medinform_v13i1e66907_app1.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material></app-group></back></article>