<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e70753</article-id><article-id pub-id-type="doi">10.2196/70753</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Analyzing Sleep Behavior Using BERT-BiLSTM and Fine-Tuned GPT-2 Sentiment Classification: Comparison Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Deng</surname><given-names>Yihan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>van der Meer</surname><given-names>Julia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tzovara</surname><given-names>Athina</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Schmidt</surname><given-names>Markus</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bassetti</surname><given-names>Claudio</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Denecke</surname><given-names>Kerstin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Institute of Computer Science, University of Bern</institution><addr-line>Neubr&#x00FC;ckstrasse 10</addr-line><addr-line>Bern</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Institute Patient-centred Digital Health, School of Engineering and Computer Science, Bern University of Applied Sciences</institution><addr-line>Biel/Bienne</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>Department of Neurology, University Hospital of Bern</institution><addr-line>Bern</addr-line><country>Switzerland</country></aff><aff id="aff4"><institution>Center for Experimental Neurology - Sleep Wake Epilepsy Center - NeuroTec, Department of Neurology, Inselspital Bern, University Hospital, University of Bern</institution><addr-line>Bern</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Klann</surname><given-names>Jeffrey</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Teodoro</surname><given-names>Douglas</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Immanuvel Arockiasamy</surname><given-names>Jesu Marcus</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Chatzimina</surname><given-names>Maria</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gandhi</surname><given-names>Meghal</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jain</surname><given-names>Praphula Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>YenPin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yihan Deng, MS, Institute of Computer Science, University of Bern, Neubr&#x00FC;ckstrasse 10, Bern, Switzerland, +41 316848426; <email>yihan.deng@students.unibe.ch</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>10</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e70753</elocation-id><history><date date-type="received"><day>31</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>30</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Yihan Deng, Julia van der Meer, Athina Tzovara, Markus Schmidt, Claudio Bassetti, Kerstin Denecke. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 10.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e70753"/><abstract><sec><title>Background</title><p>The diagnosis of sleep disorders presents a challenging landscape, characterized by the complex nature of their assessment and the often divergent views between objective clinical assessment and subjective patient experience. This study explores the interplay between these perspectives, focusing on the variability of individual perceptions of sleep quality and latency.</p></sec><sec><title>Objective</title><p>Our primary goal was to investigate the alignment, or lack thereof, between subjective experiences and objective measures in the assessment of sleep disorders.</p></sec><sec sec-type="methods"><title>Methods</title><p>To study this, we developed an aspect-based sentiment analysis method for clinical narratives: using large language models (Falcon 40B and Mixtral 8X7B), we are identifying entity groups of 3 aspects related to sleep behavior (day sleepiness, sleep quality, and fatigue). 
To phrases referring to these aspects, we are assigning sentiment values between 0 and 1 using a BERT-BiLSTM&#x2013;based approach (accuracy 78%) and a fine-tuned GPT-2 sentiment classifier (accuracy 87%).</p></sec><sec sec-type="results"><title>Results</title><p>In a cohort of 100 patients with complete subjective (Karolinska Sleepiness Scale [KSS]) and objective (Multiple Sleep Latency Test [MSLT]) assessments, approximately 15% exhibited notable discrepancies between perceived and measured levels of daytime sleepiness. A paired-sample <italic>t</italic> test comparing KSS scores to MSLT latencies approached statistical significance (<italic>t</italic><sub>99</sub>=2.456; <italic>P</italic>=.06), suggesting a potential misalignment between subjective reports and physiological markers. In contrast, the comparison using text-derived sentiment scores revealed a statistically significant divergence (<italic>t</italic><sub>99</sub>=2.324; <italic>P</italic>=.047), indicating that clinical narratives may more reliably capture discrepancies in sleepiness perception. 
These results underscore the importance of integrating multiple subjective sources, with an emphasis on narrative free text, in the assessment of domains such as fatigue and daytime sleepiness&#x2014;where standardized measures may not fully reflect the patient&#x2019;s lived experience.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our method has potential in uncovering critical insights into patient self-perception versus clinical evaluations, which enables clinicians to identify patients requiring objective verification of self-reported symptoms.</p></sec></abstract><kwd-group><kwd>sleep disorder</kwd><kwd>clinical documentation</kwd><kwd>free text</kwd><kwd>sentiment analysis</kwd><kwd>opinion discrepancy</kwd><kwd>LLM</kwd><kwd>prompting</kwd><kwd>supervised fine-tuning</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Overview</title><p>The interplay between sleep quality, daytime fatigue, and daytime sleepiness is critical for understanding and diagnosing sleep disorders. Sleep quality&#x2014;a multifaceted concept encompassing depth, continuity, and restorative nature of sleep [<xref ref-type="bibr" rid="ref1">1</xref>]&#x2014;is commonly assessed through polysomnography (PSG) or questionnaires [<xref ref-type="bibr" rid="ref2">2</xref>]. Daytime fatigue significantly impacts quality of life [<xref ref-type="bibr" rid="ref3">3</xref>], while daytime sleepiness reflects the propensity for unintended sleep episodes.</p><p>Past studies have shown inconsistencies between subjective experiences and objective measures. For example, Zavecz et al [<xref ref-type="bibr" rid="ref4">4</xref>] linked self-reported sleep quality to cognitive performance, while O&#x2019;Donnell et al [<xref ref-type="bibr" rid="ref5">5</xref>] reported poor alignment between PSG outcomes and patient-reported sleep quality. 
Similarly, Aurora et al [<xref ref-type="bibr" rid="ref6">6</xref>] analyzed how well Epworth Sleepiness Scale (ESS) scores correspond to Multiple Sleep Latency Test (MSLT) values. In psychiatric populations, similar dissociations have been reported between perceived and physiologically measured fatigue, as demonstrated by Stanyte et al [<xref ref-type="bibr" rid="ref7">7</xref>] who found that individuals with anxiety and mood disorders often exhibit mismatched subjective fatigue ratings and objective sleep parameters.</p><p>Critically, objective metrics such as sleep onset latency (SOL) and total sleep time (TST) require disorder-contextual interpretation. Valko et al [<xref ref-type="bibr" rid="ref8">8</xref>] demonstrate that shortened SOL indicates pathological sleep-wake fragmentation in narcolepsy but signifies health in normal sleepers, while subjectively prolonged SOL in insomnia reflects hyperarousal despite normal PSG. Similarly, fatigue&#x2014;though quantifiable via scales such as the Fatigue Severity Scale (FSS)&#x2014;lacks consistent PSG correlates across disorders, complicating objectification[<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Despite these assessments being routinely documented clinically, no study has leveraged real-world clinical records to analyze subjective-objective alignment across disorders.</p><p>To address this gap, we propose a novel natural language processing (NLP) framework for modeling subjective-objective alignment through aspect-based sentiment analysis. Our system extracts (1) patient-reported experiences, (2) objective test results (eg, MSLT and PSG), and (3) clinician commentary on alignment or mismatch between the two. 
This triadic analysis allows for identifying patterns of misperception&#x2014;for instance, patients who perceive severe sleep onset problems despite normal PSG values.</p><p>Unlike prior work that focused on isolated biomarkers such as Apnea-Hypopnea Index (AHI) or ESS scores, our approach uses language models to extract detailed clinical reasoning from documentation. This allows identification of emerging &#x201C;sleep-wake discrepancy&#x201D; phenotypes, such as patients with objectively normal sleep but persistently poor subjective reports.</p></sec><sec id="s1-2"><title>Sentiment Analysis in Clinical Text</title><p>Sentiment analysis in medical and clinical text has received increasing attention over the past decade [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Clinical documents often contain detailed descriptions of a patient&#x2019;s health status, including observations, diagnostic findings, and treatment plans. Analyzing this information is important for determining whether patient outcomes are improving or deteriorating, and for assessing the overall impact of a condition on the patient&#x2019;s well-being. In addition to objective clinical data, patients&#x2019; self-reported experiences may also offer useful indicators that complement medical assessments.</p><p>Recent developments in pretrained language models, such as those introduced by Devlin et al [<xref ref-type="bibr" rid="ref12">12</xref>], have enabled the use of BERT-based architectures for sentiment analysis across multiple languages and granularities. In our work, we define sentiment across 5 discrete levels, which facilitates scaling into a normalized range between 0 and 1. Behera et al [<xref ref-type="bibr" rid="ref13">13</xref>] presented a model combining Word2Vec embeddings with convolutional and recurrent layers (convolutional neural network and long short-term memory), achieving 91.2% accuracy. 
Their method also includes polarity normalization to improve classification performance. Other established techniques for sentiment and aspect identification in English-language texts include BiLSTM [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], TextCNN [<xref ref-type="bibr" rid="ref15">15</xref>], TextRNN [<xref ref-type="bibr" rid="ref16">16</xref>], TextRCNN [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], and DPCNN [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>With the emergence of transformer-based models, research has increasingly focused on controlling language model behavior during fine-tuning, inference, and output postprocessing [<xref ref-type="bibr" rid="ref20">20</xref>]. For example, Ziegler et al [<xref ref-type="bibr" rid="ref21">21</xref>] applied reinforcement learning (RL) with human feedback to improve output coherence, while Schulman et al [<xref ref-type="bibr" rid="ref22">22</xref>] proposed the Proximal Policy Optimization (PPO) algorithm for stable policy updates. Pascual et al [<xref ref-type="bibr" rid="ref23">23</xref>] introduced Keyword2Text, a framework that guides text generation based on semantic similarity and topic constraints. These control strategies can enhance performance in tasks such as sequence labeling and sentiment classification.</p><p>Despite these advances, the use of large language models (LLMs) for aspect recognition in German clinical text remains limited. In addition, data privacy regulations restrict the use of commercial cloud-based models in health care environments. To address these challenges, our study investigates the deployment of open-source language models in secure, on-premises settings. We evaluate their performance in aspect recognition using manually annotated German clinical data. 
Subsequently, we apply sentiment analysis using a fine-tuned BERT model and compare its performance to a German GPT-2 model, which we adapt through supervised learning and reinforcement-based posttraining techniques.</p><p>As can be seen in <xref ref-type="table" rid="table1">Table 1</xref>, prior work has explored sentiment in clinical text [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], or compared ESS to MSLT [<xref ref-type="bibr" rid="ref6">6</xref>], but none have jointly modeled aspect-level sentiment, LLM-based extraction, and alignment between subjective and objective sleep data. Our contributions include:</p><list list-type="order"><list-item><p>First application of aspect-based sentiment analysis to German clinical sleep data for sleep disorders.</p></list-item><list-item><p>Integration of LLM-based aspect extraction (Falcon 40B and Mixtral 8x7B).</p></list-item><list-item><p>Five-level sentiment scoring with normalization.</p></list-item><list-item><p>Quantified misalignment modeling between patient-reported and physiological sleep data.</p></list-item><list-item><p>Identification of misperception profiles at a large scale using real-world clinical records.</p></list-item></list><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of related work and contribution of this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study and Method</td><td align="left" valign="bottom">Domain</td><td align="left" valign="bottom">Model type</td><td align="left" valign="bottom">Aspect extraction</td><td align="left" valign="bottom">Sentiment analysis</td><td align="left" valign="bottom">Subjective-objective alignment</td><td align="left" valign="bottom">Sleep focused</td><td align="left" valign="bottom">Contribution notes</td></tr></thead><tbody><tr><td align="left" valign="top">Denecke and Deng [<xref ref-type="bibr" 
rid="ref24">24</xref>]</td><td align="left" valign="top">Clinical (general)</td><td align="left" valign="top">Rule-based+ML<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Limited</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">Early sentiment methods in clinical text</td></tr><tr><td align="left" valign="top">Aurora et al [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">Sleep disorders</td><td align="left" valign="top">Statistical correlation</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713; (sleepiness)</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">Compared subjective and objective sleepiness (ESS<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> vs MSLT)<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Hermans et al [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">Sleep misperception</td><td align="left" valign="top">EEG<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> analysis</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">Focus on objective EEG markers of misperception</td></tr><tr><td align="left" valign="top">Denecke and Reichenpfader [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Clinical NLP<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">Transformer (RoBERTa)</td><td align="left" valign="top">&#x2713; (limited)</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" 
valign="top">&#x2713;</td><td align="left" valign="top">Survey of clinical sentiment tools</td></tr><tr><td align="left" valign="top">Ziegler et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">General NLP</td><td align="left" valign="top">GPT-2+RLHF</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713; (general)</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">Introduce RL-based<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> alignment for subjective text</td></tr><tr><td align="left" valign="top">Our study</td><td align="left" valign="top">Sleep disorder</td><td align="left" valign="top">BiLSTM+RoBERTa/GPT-2+RL</td><td align="left" valign="top">&#x2713; (LLM-extracted)<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">&#x2713; (5-stage scale)</td><td align="left" valign="top">&#x2713; (quantified misalignment)</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">Joint subjective-objective modeling, LLM-aspect extraction, scoring, feedback mining</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ML: machine learning.</p></fn><fn id="table1fn2"><p><sup>b</sup>ESS: Epworth Sleepiness Scale.</p></fn><fn id="table1fn3"><p><sup>c</sup>MSLT: Multiple Sleep Latency Test.</p></fn><fn id="table1fn4"><p><sup>d</sup>EEG: electroencephalogram.</p></fn><fn id="table1fn5"><p><sup>e</sup>NLP: natural language processing.</p></fn><fn id="table1fn6"><p><sup>f</sup>RL: reinforcement learning.</p></fn><fn id="table1fn7"><p><sup>g</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The developed pipeline for aspect-oriented sentiment analysis is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. 
The single steps are described together with the dataset in the following. Our approach involves extracting subjective perceptions of sleep quality, daytime sleepiness, and daily fatigue from the clinical documentation. Specifically, we consider the documented patient history descriptions and the reports related to the PSG examination.</p><p>Concurrently, objective benchmarks will be gathered from numerical results in objective sources, including MSLT, MWT (Maintenance of Wakefulness Test), P-AW (Actigraphy Wrist), and parts of the PSG report.</p><p>In a first step, entities and phrases referring to our 3 aspects of interest (day sleepiness, sleep quality, and fatigue) are extracted. Once these aspects are identi&#xFB01;ed, we classify them according to the sentiment expressed for each aspect separately. The next step involves calibrating the objective numerical assessments to match the subjective sentiment scores, allowing for a direct comparison between subjective and objective data. Finally, by evaluating these 2 sets of sentiment values, we statistically analyze the extent of discrepancies in the diagnosis of sleep disorders, providing insights into how patient perceptions align with clinical assessments.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of sentiment analysis in sleep disorders. LLM: large language model; MSLT: Multiple Sleep Latency Test; MWT: Maintenance of Wakefulness Test; P-AW: Actigraphy Wrist; PPO: Proximal Policy Optimization; PSG: polysomnography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig01.png"/></fig></sec><sec id="s2-2"><title>Dataset</title><p>Our dataset originates from inpatient records created at the sleep laboratory of the University Hospital Bern, covering treatments between 2000 and 2021. This collection is part of the Bern Sleep Registry [<xref ref-type="bibr" rid="ref25">25</xref>]. 
Its secondary use received approval from the Cantonal Ethics Committee (KEK-Nr. 2022&#x2010;00415: &#x201C;Bern Sleep Registry: the sleep disorder patient cohort of the Inselspital, University Hospital Bern&#x201D;).</p><p>The database comprises German-language coded clinical records of patients diagnosed with sleep disorders. Over 10,000 of these records have been retrospectively and meticulously categorized by physicians specialized in sleep disorders based on the <italic>International Classi&#xFB01;cation of Sleep Disorders, Third Edition</italic> (<italic>ICSD-III</italic>). <xref ref-type="table" rid="table2">Table 2</xref> shows the types of text documents that are considered from this database for this paper.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The sentiment analysis mainly focuses on the subjective text (second column), while the objective value extractions are related to test results. Sample sizes reflect prevalence in clinical notes; fatigue records are rarer than sleep quality.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Aspect</td><td align="left" valign="bottom">Subjective data sources</td><td align="left" valign="bottom">Information sources and documents</td><td align="left" valign="bottom">Objective data sources</td><td align="left" valign="bottom">Statistics</td></tr></thead><tbody><tr><td align="left" valign="top">Daytime sleepiness (129 records)</td><td align="left" valign="top">History: Daytime sleepiness, Epworth Sleepiness Scale, MSLT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> /MWT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> KSS<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">History, Multiple Sleep Latency Test, Maintenance of Wakefulness Test</td><td align="left" valign="top">MSLT and MWT: eg, &#x201C;In summary, a mild daytime sleepiness can be 
objecti&#xFB01;ed,&#x201D; vigilance test results</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>36 (28%) of records show SOL-KSS<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> discrepancy</p></list-item><list-item><p>ESS<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> &#x003E;10 correlates with MSLT latency &#x003C;8 minutes (<italic>r</italic>=&#x2013;0.72)</p></list-item><list-item><p>"Mild daytime sleepiness" documented in 58 (63%) of 92 cases with MSLT 5-8 min</p></list-item></list></td></tr><tr><td align="left" valign="top">Fatigue exhaustion depressive symptoms (450 records)</td><td align="left" valign="top">History: Fatigue, Fatigue Severity Scale, BDI-II<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">Hist<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup>, P-AW<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">Increased inactivity component</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>FSS<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup> &#x003E;5 correlates with BDI-II &#x003E;14 (r=0.68)</p></list-item><list-item><p>189 (42%) show activity reduction</p></list-item><list-item><p>"Increased inactivity" noted in 112 (89%) of 126 severe fatigue cases</p></list-item></list></td></tr><tr><td align="left" valign="top">Sleep quality (3815 records)</td><td align="left" valign="top">Hist: Sleep quality, Insomnia PSG<sup><xref ref-type="table-fn" rid="table2fn10">j</xref></sup> Report: Estimated sleep onset Latency and sleep duration</td><td align="left" valign="top">Hist, PSG, P-AW</td><td align="left" valign="top">PSG Report:<break/>Objective sleep onset Latency and sleep duration, sleep efficiency (&#x00A1; 80%: reduced). 
Actigraphy Report: Actimetric sleep efficiency</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>2557 (67%) underestimate TST<sup><xref ref-type="table-fn" rid="table2fn11">k</xref></sup> (mean diff: &#x2013;73 min)</p></list-item><list-item><p>SE &#x003C;80% in 82%<sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> of insomnia diagnosis</p></list-item><list-item><p>"Reduced efficiency" documented in 94%<sup><xref ref-type="table-fn" rid="table2fn13">m</xref></sup> of SE&#x003C;75% cases</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MSLT: Multiple Sleep Latency Test.</p></fn><fn id="table2fn2"><p><sup>b</sup>MWT: Maintenance of Wakefulness Test.</p></fn><fn id="table2fn3"><p><sup>c</sup>KSS: Karolinska Sleepiness Scale.</p></fn><fn id="table2fn4"><p><sup>d</sup>SOL-KSS: Sleep Onset Latency Karolinska Sleepiness Scale</p></fn><fn id="table2fn5"><p><sup>e</sup>ESS: Epworth Sleepiness Scale.</p></fn><fn id="table2fn6"><p><sup>f</sup>BDI-II: Beck Depression Inventory II.</p></fn><fn id="table2fn7"><p><sup>g</sup>Hist: patient history.</p></fn><fn id="table2fn8"><p><sup>h</sup> P-AW: Actigraphy Wrist.</p></fn><fn id="table2fn9"><p><sup>i</sup>FSS: Fatigue Severity Scale.</p></fn><fn id="table2fn10"><p><sup>j</sup>PSG: polysomnography.</p></fn><fn id="table2fn11"><p><sup>k</sup>TST: total sleep time.</p></fn><fn id="table2fn12"><p><sup>l</sup>Absolute number not available; based on weighted insomnia subset</p></fn><fn id="table2fn13"><p><sup>m</sup>Absolute counts unavailable due to conditional subgroup definitions.</p></fn></table-wrap-foot></table-wrap><p>For various validations, we have prepared the following manually annotated datasets to validate and &#xFB01;ne-tune the pretrained model:</p><list list-type="order"><list-item><p>Aspect extraction benchmark: 150 sentences manually labeled to validate entity and aspect extraction performance.</p></list-item><list-item><p>Sentiment 
analysis corpus: 2000 sentences annotated on a 5-point sentiment scale using on-premises LLMs (Falcon 40B [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>] and Mixtral 8x7B [<xref ref-type="bibr" rid="ref28">28</xref>]), with human-in-the-loop correction.</p></list-item></list><p>The 2000 sentences were sampled from clinical documents belonging to 100 unique patients, spanning multiple sleep disorder categories. These sentences were drawn proportionally from patient histories, diagnostic summaries, and follow-up reports to ensure a representative mix across conditions and report types. Stratified sampling was used to balance diagnoses, age groups, and sentiment distribution. To avoid patient-level bias, no more than 10 sentences per patient were included in the annotated set.</p><p>To ensure robust analysis of subjective-objective alignment in daytime sleepiness, we included only those patients who had complete paired data for the following instruments: Karolinska Sleepiness Scale (KSS)&#x2014;subjective measure, MSLT&#x2014;objective physiological measure, and MWT&#x2014;behavioral sleepiness measure. This resulted in a final cohort of 100 patients with complete records across all 3 modalities, allowing for valid computation of discrepancy and misperception scores. For other dimensions, such as fatigue and sleep quality, which currently lack standardized objective clinical benchmarks, we retained a broader set of records (eg, 3815 entries for sleep quality) to support exploratory sentiment-based analysis.</p></sec><sec id="s2-3"><title>Aspect and Sentiment Distribution</title><p>To analyze sentiment by aspect, we annotated 50 representative sentences per aspect totaling 150 sentences. 
<xref ref-type="table" rid="table3">Table 3</xref> summarizes the frequency of extracted entity groups by category, while <xref ref-type="table" rid="table4">Table 4</xref> presents the sentiment annotations and related statistics.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Aspect benchmark statistics regarding sleep quality, daytime sleepiness, fatigue, and the entire corpus.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Aspects</td><td align="left" valign="bottom">List of entity groups</td><td align="left" valign="bottom">Entity count</td></tr></thead><tbody><tr><td align="left" valign="top">Sleep quality (50 sentences)</td><td align="left" valign="top">Sleep quality, Latency, Duration, Estimation</td><td align="left" valign="top">75</td></tr><tr><td align="left" valign="top">Daytime sleepiness (50 sentences)</td><td align="left" valign="top">Day sleepiness, Latency, PVT (Vigilance test), ESS<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>/KSS<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">35</td></tr><tr><td align="left" valign="top">Fatigue (50 sentences)</td><td align="left" valign="top">Fatigue Severity Scale, Measurements BDI-II<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">77</td></tr><tr><td align="left" valign="top">Entire benchmark (150 sentences)</td><td align="left" valign="top">Symptoms, Diseases, Complaints, Feedback, Measurements</td><td align="left" valign="top">119</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ESS: Epworth Sleepiness Scale.</p></fn><fn id="table3fn2"><p><sup>b</sup>KSS: Karolinska Sleepiness Scale.</p></fn><fn id="table3fn3"><p><sup>c</sup>BDI-II: Beck Depression Inventory II.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Sentiment 
statistics regarding sleep quality, daytime sleepiness, fatigue, and the entire corpus.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Sentiment class</td><td align="left" valign="bottom">Count</td><td align="left" valign="bottom">Percentage</td><td align="left" valign="bottom">Average sentence length</td><td align="left" valign="bottom">Lexical characteristics</td></tr></thead><tbody><tr><td align="left" valign="top">Very Negative</td><td align="left" valign="top">180</td><td align="left" valign="top">9%</td><td align="left" valign="top">22.4 (5.8) words</td><td align="left" valign="top">Sleep quality: "severe insomnia," "non-restorative sleep," "fragmented sleep all night"<break/>Fatigue: "debilitating exhaustion," "unable to function"<break/>Sleepiness: "dangerous sleep attacks," "uncontrollable drowsiness"</td></tr><tr><td align="left" valign="top">Negative</td><td align="left" valign="top">680</td><td align="left" valign="top">34%</td><td align="left" valign="top">18.6 (4.2) words</td><td align="left" valign="top">Sleep quality: "frequent awakenings," "prolonged sleep latency," "restless sleep"<break/>Fatigue: "persistent tiredness," "low energy throughout day"<break/>Sleepiness: "excessive daytime sleepiness," "struggling to stay awake"</td></tr><tr><td align="left" valign="top">Neutral</td><td align="left" valign="top">920</td><td align="left" valign="top">46%</td><td align="left" valign="top">14.2 (3.1) words</td><td align="left" valign="top">Sleep quality: "TST 6.2 hours," "sleep efficiency 82%," "PSG: 4 REM cycles"<break/>Fatigue: "reports moderate fatigue," "FSS score: 4.2"<break/>Sleepiness: "ESS: 12," "MSLT mean latency: 8.3 min"</td></tr><tr><td align="left" valign="top">Positive</td><td align="left" valign="top">180</td><td align="left" valign="top">9%</td><td align="left" valign="top">16.8 (3.9) words</td><td align="left" valign="top">Sleep quality: "improved sleep continuity," "satisfactory 
sleep duration"<break/>Fatigue: "reduced fatigue levels," "manageable tiredness"<break/>Sleepiness: "mild daytime sleepiness," "occasional drowsiness"</td></tr><tr><td align="left" valign="top">Very Positive</td><td align="left" valign="top">40</td><td align="left" valign="top">2%</td><td align="left" valign="top">19.3 (4.7) words</td><td align="left" valign="top">Sleep quality: "excellent sleep quality," "fully restorative sleep"<break/>Fatigue: "complete resolution of fatigue," "sustained energy"<break/>Sleepiness: "full alertness," "no daytime sleepiness"</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">2000</td><td align="left" valign="top">100%</td><td align="left" valign="top">16.8 (4.6) words</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>TST: total sleep time.</p></fn><fn id="table4fn2"><p><sup>b</sup>PSG: polysomnography.</p></fn><fn id="table4fn3"><p><sup>c</sup>REM: rapid eye movement.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4"><title>Mathematical Description of Aspect-Based Sentiment Analysis for Sleep Disorder</title><p>In the following, we define the dataset and tasks considered in this paper.</p><p>Dataset Definition</p><p>Let the dataset be defined as:</p><p><italic>D</italic>={(<italic>x<sub>i</sub></italic>, <italic>A<sub>i</sub></italic>, <italic>y<sub>i</sub></italic>) | <italic>i</italic>=1,...,<italic>N</italic>}</p><p>Where:</p><list list-type="bullet"><list-item><p><italic>x<sub>i</sub></italic> is the tokenized and embedded representation of sentence <italic>i</italic>, obtained via BERT or RoBERTa (shape: <italic>T&#x00D7;d</italic>, where <italic>T</italic>=tokens, <italic>d</italic>=embedding dimension).</p></list-item><list-item><p><italic>A<sub>i</sub></italic> is the set of extracted aspect terms from <italic>x</italic><sub><italic>i</italic></sub> using LLMs (eg, Falcon 40B and Mixtral 
8x7B).</p></list-item><list-item><p><italic>y<sub>i</sub></italic> is the discrete sentiment label &#x2208; {0, 1, 2, 3, 4} representing 5-stage sentiment levels.</p></list-item><list-item><p>The normalized score is:</p><p><inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mn>4</mml:mn></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, so <italic>s<sub>i</sub></italic>&#x2208;[0,1]</p></list-item></list><p>Aspect Extraction via LLMs</p><p>For each input sentence <italic>x<sub>i</sub></italic> and aspect category <italic>c</italic> &#x2208; <italic>C</italic> (the set of aspect categories), we define a prompt <italic>P</italic><sub><italic>c</italic></sub> and extract:</p><p><italic>a<sub>i</sub></italic><sup>(<italic>c</italic>)</sup>=<italic>LLM</italic>(<italic>P<sub>c</sub></italic>, <italic>x<sub>i</sub></italic>) for each aspect category <italic>c</italic></p><p>The union of these yields the aspect set:</p><p><italic>A<sub>i</sub></italic>=&#x22C3;<sub><italic>c</italic></sub><italic>a<sub>i</sub></italic><sup>(<italic>c</italic>)</sup></p><p>To align with canonical medical terms, compute cosine similarity:</p><p>sim(e, c) = cosine(embedding(e), embedding(c))</p><p>Accept the match if sim&#x2265;threshold</p><p>Sentiment Classification (BiLSTM+Self-Attention)</p><list list-type="order"><list-item><p>Input embeddings</p><p>Each <italic>x</italic><sub><italic>i</italic></sub>&#x2208;<italic>R</italic><sup><italic>T&#x00D7;d</italic></sup> is passed to a bidirectional LSTM:</p><p><italic>h</italic><sub><italic>i</italic></sub>=BiLSTM(<italic>x</italic><sub><italic>i</italic></sub>)&#x2208;<italic>R</italic><sup><italic>T&#x00D7;h</italic></sup></p></list-item><list-item><p>Sequential feature aggregation (concatenation or 
pooling):</p><p><italic>f</italic><sub><italic>i</italic></sub><italic>=Self &#x2013; Attention (h</italic><sub><italic>i</italic></sub><italic>)&#x2208;R</italic><sup><italic>h&#x2032;</italic></sup></p></list-item><list-item><p>Classification layer (with softmax over 5 sentiment classes): Using a linear layer with parameters <italic>W</italic>, <italic>b</italic>:</p><p><inline-formula><mml:math id="ieqn2"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> with <inline-formula><mml:math id="ieqn3"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p>Where:</p></list-item></list><list list-type="bullet"><list-item><p><italic>W</italic>&#x2208;<italic>R</italic><sup>5&#x00D7;<italic>h</italic>&#x2032;</sup>, <italic>b</italic>&#x2208;<italic>R</italic><sup>5</sup>are learnable parameters,</p></list-item><list-item><p><italic>p<sub>i</sub></italic>&#x2208;<italic>R</italic><sup>5</sup> is the predicted class distribution.</p></list-item></list><p>Training Objective 
(Supervised Fine-Tuning: L1)</p><p>Use categorical cross-entropy for supervision:</p><disp-formula id="equWL4"><mml:math id="eqn1"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>5</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mn>1</mml:mn><mml:mfenced open="[" close="]" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:mfenced><mml:mo>&#x2219;</mml:mo></mml:mrow></mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula><p>Sentiment Control via GPT-2 (L2: Reinforcement Learning) [<xref ref-type="bibr" rid="ref29">29</xref>]</p><p>After initial fine-tuning, a GPT-2 model is trained to modify or generate text while controlling for sentiment class <italic>y<sub>i</sub></italic>.</p><p>Logit Biasing / Reward Shaping:</p><disp-formula id="E10"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>&#x223C;</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Where <inline-formula><mml:math id="ieqn4"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is a reward signal based on:</p><list list-type="bullet"><list-item><p>Keyword inclusion (aspect control),</p></list-item><list-item><p>Matching predicted sentiment from a fixed classifier with <italic>y<sub>i</sub></italic>.</p></list-item></list><p>The full objective (SFT+RL) becomes:</p><disp-formula id="equWL8"><italic>L<sub>total</sub></italic>=<italic>L<sub>CE</sub></italic>+&#x03BB;&#x00B7;<italic>L<sub>RL</sub></italic></disp-formula><p>where &#x03BB;&#x2208;[0,1] is a hyperparameter controlling the influence of reinforcement-based 
alignment.</p><p>Normalization of Subjective and Objective Measures</p><p>Min-max normalization for raw clinical scores <italic>r</italic> over scale [<italic>S<sub>min</sub></italic>, <italic>S<sub>max</sub></italic>]:</p><disp-formula id="equWL9"><mml:math id="eqn4"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></disp-formula><p>Examples:</p><list list-type="bullet"><list-item><p>Epworth Sleepiness Scale (0&#x2010;24):</p><p><inline-formula><mml:math id="ieqn5"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>24</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mn>24</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula></p></list-item><list-item><p>Fatigue Severity Scale (1-7):<inline-formula><mml:math id="ieqn6"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mn>6</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula></p></list-item><list-item><p>Beck Depression Inventory II (0&#x2010;63)</p><p><inline-formula><mml:math id="ieqn7"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>63</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mn>63</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula></p></list-item><list-item><p>Karolinska Sleepiness Scale (KSS) (1-9):</p><p><inline-formula><mml:math 
id="ieqn8"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>9</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>K</mml:mi><mml:mi>S</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mn>9</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula></p></list-item></list></sec><sec id="s2-5"><title>Aspect Extraction From Diagnostic Text</title><p>For aspect extraction, we define 4 distinct groups of clinical entities based on their frequency and contextual relevance, with the aim of analyzing discrepancies between subjective reports and objective clinical findings (see <xref ref-type="table" rid="table2">Table 2</xref>). While aggregating sentiment scores can offer a general view of polarity across different textual spans (eg, sentences or paragraphs), they are insufficient for capturing detailed sentiment variations related to specific medical aspects such as symptoms, complaints, diagnoses, or co-occurring conditions. To address this limitation, we implemented a fine-grained entity extraction approach that combines open-source LLMs with standard clinical entity recognition techniques.</p><p>Specifically, we use 2 self-hosted models: Falcon 40B [<xref ref-type="bibr" rid="ref27">27</xref>] and Mixtral 8x7B [<xref ref-type="bibr" rid="ref30">30</xref>]. Falcon 40B is a dense, decoder-only language model designed for a broad range of NLP tasks. Mixtral 8x7B, by contrast, follows a sparse mixture-of-experts architecture, activating 2 expert modules per token, thereby achieving a favorable balance between performance and computational efficiency. Despite having 46.7 billion parameters in total, only a subset is used at each step, making it suitable for resource-constrained environments. 
We evaluated Falcon 40B as a baseline but used Mixtral 8x7B for all final extractions reported here.</p><p>Aspect-level sentiment scoring is achieved by linking the extracted entities to their corresponding sentiment values within a given sentence. This allows us to compute sentiment scores that are specific to individual clinical aspects. The entity extraction process is carried out in 2 stages: (Stage 1) Aspect identification using prompt-based querying with the locally hosted Mixtral 8x7B model. Separate prompts are constructed for each category&#x2014;symptoms, complaints, diagnoses, and patient feedback&#x2014;to ensure focused and interpretable extraction.</p><p>To extract patient feedback entities, we design targeted natural language prompts structured to guide the model toward recognizing relevant subjective expressions. Below is a representative prompt used with the Falcon 40B model:</p><sec id="s2-5-1"><title>Prompt</title><p>You are a helpful clinical entity extractor. Given the following medical note, please extract any patient feedback or evaluative statements related to their condition, treatment experience, or overall well-being.</p><p>Text: &#x201C;[Insert clinical sentence here]&#x201D;</p><p>Output format: [feedback_1, feedback_2, ...]</p><p>The given prompt for entity extraction from clinical patient data can be efficiently reused by systematically substituting the target category in the text. By iteratively inserting each category (eg, &#x201C;symptoms,&#x201D; &#x201C;diseases,&#x201D; &#x201C;complaints,&#x201D; and &#x201C;feedback&#x201D;) into the prompt, the same prompt structure can be applied multiple times to extract entities for one category at a time. This approach allows looping over the list of aspect categories and generating a valid JSON object for each without needing multiple distinct prompts. 
It ensures structured and repeatable extraction across all defined aspects.</p><p>To improve the identification of measurement-related entities in German clinical texts, which are often missed by LLMs due to domain-specific phrasing, we apply a semantic similarity approach based on sentence embeddings. This method is more robust to morphological variation, word order, and paraphrasing than traditional string metrics such as Levenshtein distance.</p><p>We used German-specific sentence embeddings generated by the sentence-transformers library with the multilingual model distiluse-base-multilingual-cased-v1, which supports high-quality semantic representations of German phrases.</p></sec><sec id="s2-5-2"><title>Matching Procedure</title><p>To identify and align relevant sleep-related entities from clinical narratives, we implemented a multistep matching procedure combining lexical, semantic, and synonym-based techniques, as detailed below:</p><list list-type="order"><list-item><p>Entity candidate generation: Candidate phrases were extracted from clinical documents using regex-based pattern matching and contextual heuristics (eg, token windows around sleep-related terms).</p></list-item><list-item><p>Embedding computation: Sentence embeddings were computed for each candidate phrase and compared to embeddings of canonical terms from our curated sleep measurement lexicon.</p></list-item><list-item><p>Similarity scoring: Cosine similarity was used to measure semantic closeness between candidate and canonical embeddings. 
A similarity threshold of 0.83 was used to determine a valid match based on manual validation (see <xref ref-type="table" rid="table5">Table 5</xref>).</p></list-item><list-item><p>Synonym expansion: A manually defined dictionary of common medical synonyms (eg, &#x201C;Schlafqualit&#x00E4;t&#x201D; &#x2248; &#x201C;Qualit&#x00E4;t des Schlafes&#x201D;) was integrated into the matching logic to further boost recall.</p></list-item></list><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Embedding-based semantic matching for German sleep disorder terms.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Canonical term</td><td align="left" valign="bottom">Variant detected in text</td><td align="left" valign="bottom">Cosine similarity</td><td align="left" valign="bottom">Matched?</td><td align="left" valign="bottom">English translation</td></tr></thead><tbody><tr><td align="left" valign="top">Schlafqualit&#x00E4;t</td><td align="left" valign="top">Qualit&#x00E4;t des Schlafes</td><td align="char" char="." valign="top">0.86</td><td align="left" valign="top">True</td><td align="left" valign="top">Sleep quality</td></tr><tr><td align="left" valign="top">Schlaflatenz</td><td align="left" valign="top">Zeit bis zum Einschlafen</td><td align="char" char="." valign="top">0.85</td><td align="left" valign="top">True</td><td align="left" valign="top">Time to fall asleep</td></tr><tr><td align="left" valign="top">REM-Schlafanteil<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">REM Schlafphasen Dauer</td><td align="char" char="." valign="top">0.81</td><td align="left" valign="top">False</td><td align="left" valign="top">Duration of REM sleep phases</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>REM: rapid eye movement. 
</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2-6"><title>Objective Measure Calibration</title><p>To enable comparability across heterogeneous clinical measurements and support sentiment interpretation, we apply min-max normalization to both objective and subjective metrics. This ensures all scores lie within the interval [0, 1], where 0 indicates maximal symptom burden and 1 indicates no impairment. This transformation provides a unified scale for downstream sentiment mapping.</p><p>For objective physiological measures such as PSG-recorded sleep latency, we applied the following normalization: Normalized Latency Score=1&#x2013;(Raw Value/Max Latency) where the clinically derived maximum latency was set to 120 minutes, in accordance with standard clinical PSG protocols. This formulation ensures shorter latency (indicative of better sleep initiation) corresponds to higher normalized scores.</p><sec id="s2-6-1"><title>Subjective scale normalization</title><p>For subjective clinical questionnaires (see <xref ref-type="table" rid="table6">Table 6</xref>), we used min-max normalization based on each scale&#x2019;s full range. The transformation ensures that higher symptom burden maps to lower normalized values:</p><p>Normalized Score=(Max Score &#x2013; Raw Score) /(Max Score &#x2013; Min Score)</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Normalization mappings between the objective score.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Instrument</td><td align="left" valign="bottom">Score range</td><td align="left" valign="bottom">Normalization formula</td></tr></thead><tbody><tr><td align="left" valign="top">Karolinska Sleepiness Scale (KSS)</td><td align="char" char="." 
valign="top">1&#x2010;9</td><td align="left" valign="top">(9 &#x2013; Raw Score) / (9 &#x2013; 1)</td></tr><tr><td align="left" valign="top">Epworth Sleepiness Scale (ESS)</td><td align="char" char="." valign="top">0&#x2010;24</td><td align="left" valign="top">(24 &#x2013; Raw Score) / (24 &#x2013; 0)</td></tr><tr><td align="left" valign="top">Fatigue Severity Scale (FSS)</td><td align="char" char="." valign="top">1&#x2010;7</td><td align="left" valign="top">(7 &#x2013; Raw Score) / (7 &#x2013; 1)</td></tr><tr><td align="left" valign="top">Beck Depression Inventory II (BDI-II)</td><td align="char" char="." valign="top">0&#x2010;63</td><td align="left" valign="top">(63 &#x2013; Raw Score) / (63 &#x2013; 0)</td></tr></tbody></table></table-wrap><p>This ensures that 0 corresponds to the maximum raw score (most negative sentiment), and 1 corresponds to the minimum raw score (most positive sentiment).</p></sec></sec><sec id="s2-7"><title>Architecture for Clinical Sentiment Analysis</title><p>We deploy a BERT-based BiLSTM architecture to perform sentence-level sentiment classification across 5 polarity levels (see <xref ref-type="fig" rid="figure2">Figure 2</xref>). The model ingests token sequences up to 250 tokens, with BERT embeddings (768 dimensions) [<xref ref-type="bibr" rid="ref31">31</xref>] providing context-aware representations.</p><p>We fine-tune both multilingual BERT and XLM-RoBERTa [<xref ref-type="bibr" rid="ref32">32</xref>] on a corpus of 1200 German clinical sentences annotated with 5-stage sentiment labels, derived from Mixtral 8X7B prompting and manual correction (interannotator agreement &#x003E;89%).</p><p>Negated expressions in German are normalized into unified tokens prior to BERT encoding to stabilize polarity signals. The resulting embeddings are processed by a BiLSTM layer followed by sequential feature aggregation. 
The output logits are mapped to 5 sentiment classes, which are rescaled to a [0, 1] range:</p><p>0 (very negative), 1 (negative), 2 (neutral), 3 (positive), 4 (very positive)</p><p>The model is trained using a 60/20/20 split across training (1200), validation (400), and test (400) sets.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>BiLSTM based on BERT language modeling for clinical sentiment analysis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig02.png"/></fig><sec id="s2-7-1"><title>Feature Fusion and Context Learning</title><p>The architecture incorporates multiple contextual sources:</p><p><italic>F<sub>Mask</sub></italic>: local contextual features via dynamic masking</p><p><italic>F<sub>weighting</sub></italic>: dynamically weighted local embeddings</p><p><italic>F<sub>disorder</sub></italic>: global BiLSTM features</p><p>The 5 stages of sentiment, obtained with or without BERT alignment and &#xFB01;ne-tuning, will be evaluated on these benchmarks.</p></sec><sec id="s2-7-2"><title>Feature Ensemble and Context Learning for BERT BiLSTM</title><p>During the uni&#xFB01;ed contextual feature assembling process for sleep disorder topics, both local and global contextual features derived from the contextual feature dynamic mask technique, the dynamic weighting mechanism, and the BiLSTM layers are integrated. In addition, features enriched by LLMs are incorporated. Speci&#xFB01;cally, the learned local and global contextual features are concatenated to produce a uni&#xFB01;ed contextual feature output. 
This procedure is mathematically represented as follows:</p><disp-formula id="equWL10"><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi><mml:mi>b</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2A01;</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>w</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2A01;</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In this equation, <inline-formula><mml:math id="ieqn9"><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> represents the set of local contextual embedding vectors computed by the dynamic mask technique, while <inline-formula><mml:math 
id="ieqn10"><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> denotes the set of local contextual embedding vectors examined through the dynamic weighting mechanism. <italic>F</italic><sub><italic>disorder-aspect</italic></sub> refers to the set of global contextual embedding vectors captured via BiLSTM layers. The operator &#x2A01; indicates the concatenation of these feature sets. These fused features are further processed through BiLSTM to capture sequential dependencies and then passed into a feedforward network with softmax to perform &#xFB01;ne-grained aspect-based sentiment classi&#xFB01;cation.</p></sec><sec id="s2-7-3"><title>Network Con&#xFB01;guration and Hyperparameters for BERT BiLSTM</title><p><xref ref-type="table" rid="table7">Table 7</xref> outlines the hyperparameter con&#xFB01;guration of the network, which operates in 2 main stages. In the &#xFB01;rst stage, aspect recognition is performed using the initial set of network parameters. The recognized aspects are then incorporated into the second stage through feature fusion, where they are linearly combined and pooled together with position features. This fused feature set is subsequently applied and jointly optimized within the &#xFB01;ne-tuned RoBERTa model&#x2019;s loss function. 
The optimization is carried out using a 5-stage polarity classi&#xFB01;cation, ensuring that both aspect-level insights and positional information are effectively used for enhanced performance.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Model hyperparameters for &#xFB01;ne-tuning of BERT and RoBERTa BiLSTM multilingual.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">Embedding dimension</td><td align="left" valign="top">768</td></tr><tr><td align="left" valign="top">Transformer encoder</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">Attention head</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">Optimizer</td><td align="left" valign="top">Adam<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">5e-5</td></tr><tr><td align="left" valign="top">Epoch</td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top">Dropout rate</td><td align="left" valign="top">0.2</td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">32</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>Adam: adaptive moment estimation is a stochastic optimization algorithm.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-7-4"><title>Finetuned German GPT-2 With Supervised Fine-Tuning and Logit Modi&#xFB01;cation</title><p>In addition to our BERT-BiLSTM sentiment classification baseline, we fine-tune a German GPT-2 model [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] to generate text reflecting 5 clinically relevant stages of sentiment in sleep disorder narratives. 
As illustrated in <xref ref-type="fig" rid="figure3">Figure 3</xref>, our 2-stage fine-tuning approach involves both supervised learning (L1) and reinforcement learning (L2) to align the model&#x2019;s generations with targeted sentiment levels. In the first stage (L1), we perform supervised fine-tuning (SFT) using the Hugging Face Transformer Reinforcement Learning (TRL) framework [<xref ref-type="bibr" rid="ref28">28</xref>] on a corpus of 1200 annotated German clinical texts (<xref ref-type="table" rid="table8">Table 8</xref>), each labeled with 1 of 5 sentiment classes: strong negative, minor negative, neutral, minor positive, and strong positive. The SFT phase runs for 3 epochs with a learning rate of 5e-5, a batch size of 16, and a maximum sequence length of 1024 tokens, using standard language modeling loss to adapt the pretrained GPT-2 model to the clinical sleep domain and sentiment-specific instruction format. Further details regarding the implementation are available in the GitHub repository [<xref ref-type="bibr" rid="ref35">35</xref>].</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Fine-tuned German GPT-2 followed by supervised &#xFB01;ne-tuning, reinforcement learning, and logit modi&#xFB01;cation. *RL indicates the steps of reinforcement learning RL1 and RL2. 
RL: reinforcement learning; SFT: supervised fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig03.png"/></fig><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Model hyperparameters for GPT-2 supervised fine-tuning.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-2 sentiment classi&#xFB01;er learning rate</td><td align="left" valign="top">2e-5</td></tr><tr><td align="left" valign="top">GPT-2 sentiment classi&#xFB01;er batch size</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top">SFT trainer learning rate</td><td align="left" valign="top">5e-5</td></tr><tr><td align="left" valign="top">SFT training epochs</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">SFT maximum sequence length</td><td align="left" valign="top">1024</td></tr><tr><td align="left" valign="top">RL<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup> generation kwargs top-k</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">RL generation kwargs top-p</td><td align="left" valign="top">0.5</td></tr><tr><td align="left" valign="top">Maximum new tokens</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">RL PPO<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup> mini-batch size</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top">RL PPO con&#xFB01;guration steps</td><td align="left" valign="top">41,000</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>RL: reinforcement learning.</p></fn><fn id="table8fn2"><p><sup>b</sup>PPO: Proximal Policy Optimization.</p></fn></table-wrap-foot></table-wrap><p>The second stage (L2) consists of RL via PPO 
[<xref ref-type="bibr" rid="ref33">33</xref>], using the sentiment classifier as a reward model. Here, the model generates responses conditioned on prompts and randomly sampled target sentiment tokens. The classifier evaluates each generation and assigns a reward signal based on the logit of the target sentiment class, encouraging the model to reinforce sentiment-accurate responses. This process is iteratively optimized over 41,000 PPO steps, using a PPO mini-batch size of 16, and generation hyperparameters including top-k=3, top-p=0.5, and a maximum of 100 new tokens per sample.</p><p>We designate these PPO training phases as RL1 and RL2, which are now clearly labeled in <xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure3">3</xref> *RL. Each RL phase refines the generator&#x2019;s ability to follow sentiment conditioning with increasing precision. This logit-based reward shaping strategy ensures granular control over sentiment realization in the output. Our evaluation metric during PPO optimization is the accuracy of generated sentiment, as classified by the same frozen classifier used for reward computation. The combination of supervised preadaptation and reward-driven fine-tuning allows our GPT-2 model to generate sentiment-aligned narratives that preserve clinical plausibility while reflecting emotional tone variations essential for downstream affective or patient-centered NLP tasks.</p></sec><sec id="s2-7-5"><title>Derived Subjective Sleepiness Scoring and Comparison to Objective Measures</title><sec id="s2-7-5-1"><title>Subjective Sleepiness Approach</title><p>To robustly assess subjective sleepiness, we combined inputs from the KSS and clinician-documented descriptions to produce a harmonized subjective score. 
KSS ratings were first converted to a 5-stage ordinal scale (0&#x2010;4), with thresholds mapped as follows: 1&#x2010;2=4 (very severe sleepiness), 3&#x2010;4=3, 5&#x2010;6=2, 7&#x2010;8=1, 9=0 (no sleepiness).</p><p>This reverse scoring reflects the interpretation that lower KSS values indicate greater momentary sleepiness.</p><p>In parallel, clinical text segments were processed using a sentiment analysis pipeline based on a fine-tuned GPT-2 model trained with RL to optimize accuracy across 5 sentiment stages. This model provided a text-derived sleepiness score on the same 0&#x2010;4 scale, where 0 indicated no subjective complaints and 4 indicated severe functional impairment due to sleepiness.</p></sec><sec id="s2-7-5-2"><title>Objective Sleepiness Derivation</title><p>The MSLT was used to obtain an objective index of daytime sleepiness. Since shorter latencies reflect greater sleepiness, raw MSLT values were inverted and scaled onto a 0&#x2010;4 range: objective score = ((20 &#x2013; MSLT latency) / 20)&#x00D7;4. This allowed direct comparison of subjective and objective sleepiness on the same scale.</p></sec><sec id="s2-7-5-3"><title>Paired Comparison and Findings</title><p>To evaluate the degree of alignment between subjective and objective sleepiness, we performed a paired-sample <italic>t</italic> test. For each patient, we computed the difference between the harmonized KSS sleepiness stage and GPT-2-RL&#x2013;derived text score and the corresponding objective sleepiness score derived from MSLT latency.</p><p>The paired-sample <italic>t</italic> test evaluates whether the mean difference between the subjective and objective scores across all patients significantly differs from zero. 
The test statistic is calculated using:</p><disp-formula id="E13"><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mover><mml:mi>d</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>d</italic> is the mean of the differences between paired observations:</p><disp-formula id="equWL11"><mml:math id="eqn7"><mml:mover accent="false"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>&#x00AF;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula><p><italic>S<sub>d</sub></italic> is the standard deviation of the differences:</p><disp-formula id="equWL12"><mml:math id="eqn8"><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msqrt><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo 
stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mover accent="false"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>&#x00AF;</mml:mo></mml:mover><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:msqrt></mml:math></disp-formula><p><italic>n</italic> is the number of paired samples, <italic>x<sub>i</sub></italic> is the subjective score for patient <italic>i</italic>, and <italic>y<sub>i</sub></italic> is the objective score for patient <italic>i</italic>.</p></sec></sec></sec><sec id="s2-8"><title>Ethical Considerations</title><p>Ethical approval for this study was obtained from the Kantonale Ethik Kommission Bern (Cantonal Ethics Committee Bern) for multiple project components: Project part 1: SNS Project (2000-2016), BASEC-ID 2016-00409 and Project part 2: Bern Sleep Registry (&#x201C;The sleep disorder patient cohort of the Inselspital, University Hospital Bern<italic>&#x201D;</italic>), KEK-Nr. 2022-00415.</p><p>The secondary use of data from the Bern Sleep Registry was also approved by the Cantonal Ethics Committee. All data were handled in accordance with institutional and Swiss data protection regulations. Informed consent was obtained as required for each project component, and all participants were informed of their ability to opt out of data use. 
All data were fully de-identified prior to analysis to protect participant privacy and ensure compliance with applicable data protection standards.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>This section presents the results of our aspect-based sentiment analysis pipeline, including (1) 5-stage sentiment classification using transformer-based models, and (2) LLM-based extraction of clinical entity aspects. The analyses were applied to a dataset of sleep-related medical texts and further supported by clinical insights into subjective-objective misperception.</p></sec><sec id="s3-2"><title>Evaluation of the Aspect-Based Sentiment Analysis</title><p><xref ref-type="table" rid="table9">Table 9</xref> summarizes the performance of several models for 5-stage sentiment classification, evaluated using standard multiclass metrics. Initial models such as BERT and RoBERTa achieved moderate performance (accuracy 61% and 69%, respectively). Incorporating a BiLSTM layer improved performance to 78%. Further gains were achieved with a fine-tuned GPT-2 model. SFT alone yielded 81% accuracy, and RL with logit-space modulation increased it to 87%. These results demonstrate the effectiveness of multistage alignment in modeling sentiment nuances. Metrics reported in <xref ref-type="table" rid="table9">Table 9</xref> are macro-averaged across 5 sentiment classes, based on an 80/20 stratified test split.</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Performance of sentiment classification of semiautomatically annotated 5-stage sentiment. 
Training: 1200 sentences (60%), Validation: 400 (20%), Test: 400 (20%) for hyperparameter optimization.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Sentiment task</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUC-ROC<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup></td><td align="left" valign="bottom">Precision (macro)</td><td align="left" valign="bottom">Recall (macro)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (macro)</td></tr></thead><tbody><tr><td align="left" valign="top">BERT 5-stage</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.76</td><td align="char" char="." valign="top">0.62</td><td align="char" char="." valign="top">0.60</td><td align="char" char="." valign="top">0.61</td></tr><tr><td align="left" valign="top">RoBERTa 5-stage</td><td align="char" char="." valign="top">0.69</td><td align="char" char="." valign="top">0.84</td><td align="char" char="." valign="top">0.70</td><td align="char" char="." valign="top">0.68</td><td align="char" char="." valign="top">0.69</td></tr><tr><td align="left" valign="top">RoBERTa+BiLSTM</td><td align="char" char="." valign="top">0.78</td><td align="char" char="." valign="top">0.90</td><td align="char" char="." valign="top">0.79</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">0.78</td></tr><tr><td align="left" valign="top">GPT-2+SFT<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup></td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.93</td><td align="char" char="." valign="top">0.82</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top">GPT-2+SFT+RL1<sup><xref ref-type="table-fn" rid="table9fn3">c</xref></sup>+ logit mod</td><td align="char" char="." 
valign="top">0.85</td><td align="char" char="." valign="top">0.95</td><td align="char" char="." valign="top">0.86</td><td align="char" char="." valign="top">0.85</td><td align="char" char="." valign="top">0.85</td></tr><tr><td align="left" valign="top">GPT-2+SFT+RL2+logit mod</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.96</td><td align="char" char="." valign="top">0.88</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.87</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>AUC-ROC: area under the receiver operating characteristic curve.</p></fn><fn id="table9fn2"><p><sup>b</sup>SFT: supervised &#xFB01;ne-tuning.</p></fn><fn id="table9fn3"><p><sup>c</sup>RL: reinforcement learning.</p></fn></table-wrap-foot></table-wrap><p>All metrics were computed using macro-averaging, giving equal weight to all sentiment classes. The AUC-ROC (area under the receiver operating characteristic curve) values were obtained using a one-vs-rest strategy, macro-averaged across 5 classes. These approaches ensure that both frequent and rare sentiment classes are equally represented in performance assessment.</p><p><xref ref-type="fig" rid="figure4">Figure 4</xref> visualizes the ROC curves of the models. GPT-2 variants with SFT+RL training consistently outperformed the baselines, reflecting enhanced class separability through reinforcement learning and logit regularization.</p><p>The ROC-AUC curves illustrate the one-vs-rest performance of 6 models on the 5-stage sentiment classification task using clinical text from sleep disorder records. All models perform substantially above the chance level (AUC=0.5), with performance steadily improving from BERT (AUC=0.76) to RoBERTa (AUC=0.84), and further to RoBERTa+BiLSTM (AUC=0.90). 
Fine-tuned GPT-2 (SFT) achieves strong performance (AUC=0.93), which is further enhanced by reinforcement learning (RL1 and RL2) and logit modification, reaching up to AUC=0.96. The stair-step shapes reflect the small test set (400 samples), but the consistent trend shows that reinforcement learning significantly boosts the model&#x2019;s ability to distinguish fine-grained sentiment levels, supporting its potential for clinical decision support in contexts such as misperception of sleepiness.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>AUC-ROC (area under the receiver operating characteristic curves) for 5-class sentiment classification across transformer-based models. SFT: supervised fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig04.png"/></fig></sec><sec id="s3-3"><title>Aspect Extraction Through LLM Falcon 40B and Mixtral 8X7B</title><p>Clinical entity extraction was performed using prompting strategies applied to Falcon 40B and Mixtral 8X7B models. <xref ref-type="table" rid="table10">Table 10</xref> compares their performance on 4 annotated categories: Symptoms, Diseases, Complaints, and Feedback. 
Mixtral 8X7B showed strong performance with macro <italic>F</italic><sub>1</sub> of 0.8490, outperforming Falcon 40B (macro <italic>F</italic><sub>1</sub> of 0.7265).</p><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Performance of the aspect extraction and benchmarks: 150 manually labeled sentiment aspects.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Performance of aspect extraction</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Micro <italic>F</italic><sub>1</sub></td><td align="left" valign="bottom">AUC-ROC<sup><xref ref-type="table-fn" rid="table10fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Falcon 40B instruct entity extraction</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Symptoms</td><td align="char" char="." valign="top">0.8157</td><td align="char" char="." valign="top">0.7963</td><td align="char" char="." valign="top">0.8054</td><td align="char" char="." valign="top">0.903</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diseases</td><td align="char" char="." valign="top">0.8321</td><td align="char" char="." valign="top">0.8485</td><td align="char" char="." valign="top">0.8397</td><td align="char" char="." valign="top">0.920</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complaints</td><td align="char" char="." valign="top">0.7829</td><td align="char" char="." valign="top">0.7134</td><td align="char" char="." valign="top">0.7466</td><td align="char" char="." 
valign="top">0.873</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Feedback</td><td align="char" char="." valign="top">0.5416</td><td align="char" char="." valign="top">0.4896</td><td align="char" char="." valign="top">0.5142</td><td align="char" char="." valign="top">0.757</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro <italic>F</italic><sub>1</sub></td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table10fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">0.7265</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">Mixtral 8X7B Instruct v0.1 entity extraction</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Symptoms</td><td align="char" char="." valign="top">0.9175</td><td align="char" char="." valign="top">0.9114</td><td align="char" char="." valign="top">0.9054</td><td align="char" char="." valign="top">0.953</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diseases</td><td align="char" char="." valign="top">0.8812</td><td align="char" char="." valign="top">0.8804</td><td align="char" char="." valign="top">0.8797</td><td align="char" char="." valign="top">0.940</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complaints</td><td align="char" char="." valign="top">0.8411</td><td align="char" char="." valign="top">0.8338</td><td align="char" char="." valign="top">0.8266</td><td align="char" char="." 
valign="top">0.913</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Feedback</td><td align="char" char="." valign="top">0.8616</td><td align="char" char="." valign="top">0.8028</td><td align="char" char="." valign="top">0.8312</td><td align="char" char="." valign="top">0.903</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro <italic>F</italic><sub>1</sub></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">0.8490</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table10fn1"><p><sup>a</sup>AUC-ROC: area under the receiver operating characteristic curve. </p></fn><fn id="table10fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>Since Mixtral 8X7B consistently outperformed Falcon 40B across all evaluated categories, we applied Mixtral 8X7B for all entity extraction tasks in our sentiment analysis pipeline. Beyond the benchmark evaluation, the model extracted a total of 873 unique symptom terms, along with 59 distinct diseases, 432 complaints, and 224 feedback-related entities across the full clinical corpus.</p><sec id="s3-3-1"><title>Sentiment Distribution Patterns</title><p>Currently, there are no established objective ground truth standards for evaluating misperception in insomnia, and for disorders such as hypersomnia or sleep-disordered breathing, further clinical validation is required. Thus, our focus was placed on daytime sleepiness, where both subjective (KSS) and objective (MSLT and MWT) data were consistently available.</p><p>The KSS and the ESS both assess subjective sleepiness but differ in temporal focus: KSS captures momentary state-level sleepiness, while ESS reflects trait-level sleepiness across habitual situations. 
Because KSS is time-specific, it aligns more directly with objective measures such as MSLT latency.</p><p>In this study, we focus exclusively on KSS&#x2014;with or without clinical text&#x2014;to assess momentary misperception in daytime sleepiness. This approach avoids confounding from chronic perception scales such as ESS and enables precise modeling of state-dependent discrepancies.</p><p><xref ref-type="fig" rid="figure5">Figures 5</xref><xref ref-type="fig" rid="figure6"/>-<xref ref-type="fig" rid="figure7">7</xref> visualize the distribution of sentiment values across different clinical contexts. <xref ref-type="fig" rid="figure5">Figure 5</xref> presents sentiment grouped by clinical entity (eg, Sleep quality, Symptoms, and Feedback). Positive sentiment distributions were observed for Feedback and Sleep Hygiene, while negative sentiment dominated in entities such as Complaints and Diseases. <xref ref-type="fig" rid="figure6">Figure 6</xref> shows sentiment variation across document types: patient-generated narratives (P) and Actigraphy Wrist (P-AW) reports tended to be more emotionally expressive than historical records (Hist). <xref ref-type="fig" rid="figure7">Figure 7</xref> aggregates sentiment by clinical aspect groupings, showing that Sleep quality included more variability in sentiment, whereas Fatigue and Day sleepiness showed predominantly negative sentiment patterns&#x2014;suggesting a greater emotional burden associated with these complaints.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Sentiment distribution by clinical entity group. 
FSS: Fatigue Severity Scale; MSLT: Multiple Sleep Latency Test; MWT: Maintenance of Wakefulness Test.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Sentiment distribution by document type (Hist: History, P: Polysomnography, P-AW: Actigraphy Wrist), 5-stage sentiment score normalized into a value between 0 and 1.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Sentiment distribution by sleep disorder aspect (quality, fatigue, and sleepiness).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig07.png"/></fig></sec><sec id="s3-3-2"><title>Subjective-Objective Mismatch in Sleepiness Reporting</title><p>To examine potential misperceptions in day sleepiness, we compared subjective KSS reports with objective latency data from MSLT and MWT protocols. KSS values were normalized to a 0&#x2010;1 range using the formula: KSS_normalized=(9&#x2013;Raw Score)/(9&#x2013;1). This transformation enables alignment with the inverse latency scale.</p><p><xref ref-type="fig" rid="figure8">Figure 8</xref> presents the misperception score distribution across 3 test conditions: 10 minutes pair: mslt_kssdur_10 versus latency_10, 12 minutes pair: mslt_kssdur_12 versus latency_12, and mwt_kssdur versus latency. Misperception was defined as the deviation between normalized KSS and expected sleep latency levels. Across all pairings, positive misperception scores indicate that patients tend to report higher subjective sleepiness than measured objectively. 
The largest variability appeared in the mslt_kssdur_12 condition, while MWT tests showed narrower error bands.</p><p><xref ref-type="table" rid="table11">Table 11</xref> presents a representative subset of 10 patients illustrating questionnaire-expert-validated, text-predicted, and measured sleepiness scores. Although individual patterns vary, a general trend of overestimated sleepiness is observable.</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>Mismatch between subjective sleepiness (Karolinska Sleepiness Scale) and objective sleep latency (Multiple Sleep Latency Test and Maintenance of Wakefulness Test) in day sleepiness documents with a threshold of 10 and 12 minutes of latency. KSS: Karolinska Sleepiness Scale; MSLT: Multiple Sleep Latency Test; MWT: Maintenance of Wakefulness Test.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e70753_fig08.png"/></fig><table-wrap id="t11" position="float"><label>Table 11.</label><caption><p>A snapshot and working example with 10 patients from the selected cohort of 100 patients for day sleepiness, illustrating normalized Karolinska Sleepiness Scale (momentary) and text-derived score, expert-labeled and predicted subjective scores, along with corresponding Multiple Sleep Latency Test latencies and normalized objective scores.</p></caption><table id="table11" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Patient ID</td><td align="left" valign="bottom">KSS<sup><xref ref-type="table-fn" rid="table11fn1">a</xref></sup> stage</td><td align="left" valign="bottom">Text sentiment stage</td><td align="left" valign="bottom">MSLT<sup><xref ref-type="table-fn" rid="table11fn2">b</xref></sup> latency (min)</td><td align="left" valign="bottom">Objective score (0&#x2010;4)</td><td align="left" valign="bottom">Differences KSS MSLT</td><td align="left" valign="bottom">Differences text 
MSLT</td></tr></thead><tbody><tr><td align="left" valign="top">P001</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">5.04</td><td align="left" valign="top">2.992</td><td align="left" valign="top">&#x2212;0.992</td><td align="char" char="." valign="top">&#x2212;0.992</td></tr><tr><td align="left" valign="top">P002</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1.12</td><td align="left" valign="top">3.78</td><td align="left" valign="top">&#x2212;1.78</td><td align="char" char="." valign="top">&#x2212;2.78</td></tr><tr><td align="left" valign="top">P003</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">7.76</td><td align="left" valign="top">2.45</td><td align="left" valign="top">0.55</td><td align="char" char="." valign="top">&#x2212;0.45</td></tr><tr><td align="left" valign="top">P004</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1.13</td><td align="left" valign="top">3.77</td><td align="left" valign="top">&#x2212;3.77</td><td align="char" char="." valign="top">&#x2212;2.77</td></tr><tr><td align="left" valign="top">P005</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">0.14</td><td align="left" valign="top">3.97</td><td align="left" valign="top">&#x2212;0.97</td><td align="char" char="." valign="top">&#x2212;2.97</td></tr><tr><td align="left" valign="top">P006</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">15.99</td><td align="left" valign="top">0.80</td><td align="left" valign="top">1.20</td><td align="char" char="." valign="top">0.20</td></tr><tr><td align="left" valign="top">P007</td><td align="char" char="." 
valign="top">0</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">9.9</td><td align="left" valign="top">2.02</td><td align="left" valign="top">&#x2212;2.02</td><td align="char" char="." valign="top">&#x2212;1.02</td></tr><tr><td align="left" valign="top">P008</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">10.54</td><td align="left" valign="top">1.89</td><td align="left" valign="top">1.11</td><td align="char" char="." valign="top">&#x2212;0.89</td></tr><tr><td align="left" valign="top">P009</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">8.83</td><td align="left" valign="top">2.23</td><td align="left" valign="top">0.77</td><td align="char" char="." valign="top">0.77</td></tr><tr><td align="left" valign="top">P010</td><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">9.93</td><td align="left" valign="top">2.01</td><td align="left" valign="top">1.99</td><td align="char" char="." valign="top">&#x2212;0.01</td></tr></tbody></table><table-wrap-foot><fn id="table11fn1"><p><sup>a</sup>KSS: Karolinska Sleepiness Scale.</p></fn><fn id="table11fn2"><p><sup>b</sup>MSLT: Multiple Sleep Latency Test.</p></fn></table-wrap-foot></table-wrap><p>A paired-sample <italic>t</italic> test conducted across the full cohort (n=100) revealed distinct patterns in the alignment between subjective and objective sleepiness assessments. While KSS scores showed a borderline nonsignificant divergence from MSLT-derived latencies (<italic>t</italic><sub>99</sub>=2.456, <italic>P</italic>=.06), text-derived sentiment scores demonstrated a statistically significant misalignment (<italic>t</italic><sub>99</sub>=2.324, <italic>P</italic>=.05). 
These findings suggest that free-text clinical narratives more consistently diverge from physiological measures of sleepiness than structured scales such as the KSS. This may reflect the emotionally nuanced and context-rich nature of patient-reported symptoms embedded in narrative records. Importantly, although both subjective measures showed overestimation of sleepiness, only the text-derived sentiment reached statistical significance (<italic>P</italic>=.05), indicating a consistent, though not necessarily larger, misalignment with physiological data. More details can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study introduced a sentiment analysis framework for clinical sleep narratives, revealing key insights into both method performance and clinical relevance. The BERT-BiLSTM architecture improved sentiment accuracy by addressing negations and drift, while the Mixtral 8X7B model outperformed Falcon 40B in aspect extraction due to better handling of complex German syntax. A fine-tuned GPT-2 model with reinforcement learning achieved high sentiment classification accuracy (87%) and offered a resource-efficient alternative to LLMs. Clinically, sentiment-derived scores revealed consistent misperception patterns&#x2014;such as underestimated sleep latency and overestimated duration&#x2014;highlighting the importance of aligning subjective and objective measures. These findings support the integration of sentiment-informed misperception analysis into sleep medicine workflows to improve diagnosis, treatment selection, and patient safety.</p></sec><sec id="s4-2"><title>Method Evaluation</title><p>In this study, we introduced and evaluated an aspect-based sentiment analysis approach tailored to clinical narratives in sleep medicine. Several key observations emerged from our findings. 
First, when mapping sentiment model outputs to scalar indicators of sleep disorders, it became necessary to adjust for negations and drifted sentiment&#x2014;especially in baseline BERT models. Phrases that negated the presence of symptoms or diagnoses were frequently misinterpreted as negative sentiment. This issue was mitigated using a BERT-BiLSTM architecture, which improved performance through representation adaptation and fine-tuning.</p><p>For aspect (entity) extraction using prompted LLMs, recall was generally low, particularly for feedback-related content. In contrast, precision was high for identifying symptoms, diseases, and complaints. The poor recall is primarily attributed to the token limit and the generation style of Falcon 40B under one-shot prompting. Interestingly, few-shot prompting increased precision but further reduced recall and <italic>F</italic><sub>1</sub>-score. This reveals a trade-off that may be addressed either by increasing computational capacity (current setup: 128 GB Nvidia DGX server) or optimizing prompting strategies.</p><p>The Mixtral 8X7B model significantly outperformed Falcon 40B, delivering higher precision and recall for aspect recognition, even under single-shot settings. Its ability to handle longer input contexts and complex linguistic structures&#x2014;such as German postnegation&#x2014;proved particularly advantageous in the clinical domain. These results support the use of Mixtral 8x7B for German-language medical NLP tasks where nuanced comprehension is critical.</p><p>Compared to BERT-BiLSTM, a fine-tuned GPT-2 model trained on 1200 sleep-related clinical sentences achieved comparable performance (accuracy: 81%). Incorporating keyword-guided SFT and 2 RL epochs with logit modification raised accuracy to 87%. This dual-phase tuning strategy enhanced the model&#x2019;s flexibility and control. 
It also outperformed sequential prompting approaches commonly used with larger LLMs&#x2014;offering higher precision with lower resource requirements, particularly important in privacy-sensitive clinical settings.</p></sec><sec id="s4-3"><title>Clinical Outcomes</title><p>The interpretation of sentiment-derived misperception patterns must consider both the nature of subjective reporting and the inherent limitations of objective measurement in clinical sleep data. Misperception&#x2014;defined as a mismatch between how patients feel and how their physiological state is measured&#x2014;varies across dimensions of sleep disorders, with important diagnostic and therapeutic implications.</p><p>A key finding in our study was the divergence in sentiment-derived scores for similar clinical constructs. For instance, patients frequently underestimated their sleep latency (eg, reporting quick sleep onset) yet simultaneously overestimated their sleep duration, contrary to PSG and MSLT results. This subjective-objective mismatch&#x2014;commonly referred to as sleep misperception&#x2014;was particularly evident across latency, duration, and fatigue indicators. While traditional statistical tests may not fully capture the nuances of sentiment-informed scores, the consistent directionality and magnitude of these mismatches highlight their clinical importance.</p><p>However, the objectification of sleep quality and fatigue is inherently complex. Short sleep latency, often seen as a marker of healthy sleep, can paradoxically indicate pathological hypersomnolence in conditions such as narcolepsy. Similarly, fatigue&#x2014;unlike sleepiness&#x2014;lacks a gold-standard physiological test and is primarily self-reported, making its objectification more elusive. These nuances emphasize the need to interpret subjective scores considering disorder-specific clinical contexts.</p><p>In contrast, daytime sleepiness offers a more quantifiable target for misperception analysis. 
Here, subjective ratings such as the KSS and text-derived sentiment scores can be directly compared with objective markers such as MSLT latency. The KSS is particularly useful in this context, as it captures state-based, momentary sleepiness, allowing for meaningful trial-by-trial comparisons. For example, KSS values close to 9 (indicating &#x201C;extremely sleepy&#x201D;) should theoretically align with MSLT latencies under 8 minutes. In our analysis framework, misperception magnitude increases when a patient reports low KSS (ie, &#x201C;did not fall asleep&#x201D;) while objective data confirm they did&#x2014;for example, falling asleep within 10&#x2010;20 minutes in MSLT trials. This alignment enables fine-grained modeling of momentary misperception and supports targeted subanalysis of repeated MSLT and KSS pairs (eg, mslt_kssdur_10 vs mslt_sleeplatency_10).</p><p>From a clinical perspective, understanding the type and direction of misperception has actionable relevance.</p><p>For instance, in insomnia, patients often report exaggerated sleep difficulties (eg, longer sleep latency than measured). These cases may benefit more from cognitive behavioral therapy for insomnia rather than pharmacotherapy. Discussing the misperception with the patient is often therapeutic. However, because objective tests such as PSG are rarely used in routine insomnia management, wearable technologies (eg, smartwatches) could serve as surrogate tools to estimate sleep latency, duration, and fragmentation&#x2014;offering scalable insight into real-world misperception.</p><p>In hypersomnia or sleep-disordered breathing, the opposite trend may occur&#x2014;patients feel they slept well but exhibit abnormal sleep architecture or excessive daytime sleepiness in tests. In such cases, unrecognized sleepiness can elevate safety risks, such as motor vehicle or occupational accidents. 
Early identification of these cases using sentiment-objective comparison could trigger earlier clinical intervention or patient education.</p><p>Altogether, our sentiment analysis framework, enriched with structured objective metrics, offers a promising route for personalized assessment of sleep misperception. By comparing KSS, ESS, and text-derived sentiment scores against MSLT latency across multiple configurations, we can determine which input combination best captures the true subjective state. This multipronged analysis will inform future iterations of our clinical decision-support tools and help stratify patients for tailored treatment approaches.</p><sec id="s4-3-1"><title>Clinical Applications</title><p>Integrating sentiment analysis and misperception scoring into clinical workflows opens up new avenues for personalized, perception-aware sleep medicine:</p><list list-type="order"><list-item><p>Flag discrepancies in sleep latency and duration for targeted discussion or reassessment.</p></list-item><list-item><p>Stratify patients by degree of misperception to inform whether cognitive behavioral therapy for insomnia or pharmacotherapy is appropriate.</p></list-item><list-item><p>Highlight the underestimation of sleepiness in MSLT and MWT contexts for workplace or safety interventions.</p></list-item><list-item><p>Incorporate wearable data to validate or challenge patient self-reporting in follow-up care.</p></list-item></list><p>By aligning sentiment-derived patterns with objective sleep markers, clinicians can better understand not only what the patient reports but how they perceive their condition&#x2014;a critical step toward improving diagnostic accuracy, treatment matching, and long-term outcomes.</p></sec></sec><sec id="s4-4"><title>Conclusion and Future Work</title><p>This study demonstrates that sentiment analysis applied to structured clinical narratives can uncover meaningful patterns in how patients perceive and report sleep 
disorders&#x2014;especially in identifying discrepancies between subjective self-reports and objective assessments. Among the models evaluated, the BERT-BiLSTM architecture showed strong performance in domain-specific sentiment detection. However, the fine-tuned German GPT-2 model, enhanced with supervised and reinforcement learning (PPO), achieved the highest accuracy for 5-stage sentiment classification, offering an effective balance between adaptability and computational efficiency.</p><p>Our findings highlight the clinical value of sentiment-based misperception modeling. For example, patients with insomnia often overestimate wakefulness, while those with hypersomnia or sleep apnea tend to underreport daytime sleepiness. The use of reinforcement learning techniques proved especially effective for capturing emotionally nuanced language in clinical narratives while maintaining sensitivity to domain-specific clinical distinctions.</p><p>However, this study is not without limitations. All data were drawn from a single clinical registry, which may limit the generalizability of findings across institutions or populations. Furthermore, the annotated training set for sentiment and entity extraction was relatively small, potentially constraining model robustness and the diversity of learned patterns.</p><p>In future work, we aim to expand the dataset by incorporating records from additional clinical centers and to enrich the annotation set to enable more nuanced model training and validation. Sentiment modeling will be extended to cover additional clinical entities such as pain and cognitive symptoms, while correlations between sentiment-derived misperception scores and treatment outcomes or resistance will be further explored. 
We also plan to integrate physiological features&#x2014;such as microarousals and sleep stage transitions&#x2014;to improve subtype classification and to leverage wearable sensor data to validate and calibrate sentiment-based misperception metrics. Ultimately, our goal is to develop a precision sleep medicine framework that integrates objective physiological markers with patients&#x2019; emotional and cognitive interpretations of their symptoms, captured through advanced sentiment and entity modeling of clinical narratives.</p></sec></sec></body><back><ack><p>The secondary usage of Bern Sleep Data Base (BSDB) from Inselspital, University Hospital Bern, was approved by the local ethics committee (KEK-Nr. 2022&#x2010;00415).</p></ack><notes><sec><title>Data Availability</title><p>The datasets analyzed in this study are not publicly available due to data privacy considerations [<xref ref-type="bibr" rid="ref31">31</xref>], but they may be obtained from the institutional review board of Inselspital upon reasonable application.</p></sec></notes><fn-group><fn fn-type="con"><p>YD and KD contributed to the conceptualization, methodology, validation, and original draft preparation. MS, CLAB, and JVDM were responsible for data curation. YD conducted the investigation. YD, KD, JVDM, AT, MS, and CLAB reviewed and edited the manuscript. 
AT also contributed to conceptualization.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AHI</term><def><p>Apnea-Hypopnea Index</p></def></def-item><def-item><term id="abb2">ESS</term><def><p>Epworth Sleepiness Scale</p></def></def-item><def-item><term id="abb3">FSS</term><def><p>Fatigue Severity Scale</p></def></def-item><def-item><term id="abb4"><italic>ICSD-III</italic></term><def><p> <italic>International Classification of Sleep Disorders, Third Edition</italic></p></def></def-item><def-item><term id="abb5">KSS</term><def><p>Karolinska Sleepiness Scale</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">MSLT</term><def><p>Multiple Sleep Latency Test</p></def></def-item><def-item><term id="abb8">MWT</term><def><p>Maintenance of Wakefulness Test</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">P-AW</term><def><p>Actigraphy Wrist</p></def></def-item><def-item><term id="abb11">PPO</term><def><p>Proximal Policy Optimization</p></def></def-item><def-item><term id="abb12">PSG</term><def><p>polysomnography</p></def></def-item><def-item><term id="abb13">RL</term><def><p>reinforcement learning</p></def></def-item><def-item><term id="abb14">SFT</term><def><p>supervised fine-tuning</p></def></def-item><def-item><term id="abb15">SOL</term><def><p>sleep onset latency</p></def></def-item><def-item><term id="abb16">TRL</term><def><p>Transformer Reinforcement Learning</p></def></def-item><def-item><term id="abb17">TST</term><def><p>total sleep time</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michal</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Wiltink</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kirschner</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Complaints of sleep disturbances are associated with cardiovascular disease: results from the Gutenberg Health Study</article-title><source>PLoS One</source><year>2014</year><volume>9</volume><issue>8</issue><fpage>e104324</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0104324</pub-id><pub-id pub-id-type="medline">25093413</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kotterba</surname><given-names>S</given-names> </name><name name-style="western"><surname>Neusser</surname><given-names>T</given-names> </name><name name-style="western"><surname>Norenberg</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Sleep quality, daytime sleepiness, fatigue, and quality of life in patients with multiple sclerosis treated with interferon beta-1b: results from a prospective observational cohort study</article-title><source>BMC Neurol</source><year>2018</year><month>08</month><day>24</day><volume>18</volume><issue>1</issue><fpage>123</fpage><pub-id pub-id-type="doi">10.1186/s12883-018-1113-5</pub-id><pub-id pub-id-type="medline">30143019</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mendonca</surname><given-names>F</given-names> </name><name name-style="western"><surname>Mostafa</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Morgado-Dias</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ravelo-Garcia</surname><given-names>AG</given-names> </name><name 
name-style="western"><surname>Penzel</surname><given-names>T</given-names> </name></person-group><article-title>A review of approaches for sleep quality analysis</article-title><source>IEEE Access</source><year>2019</year><volume>7</volume><fpage>24527</fpage><lpage>24546</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2019.2900345</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zavecz</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nagy</surname><given-names>T</given-names> </name><name name-style="western"><surname>Galk&#x00F3;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nemeth</surname><given-names>D</given-names> </name><name name-style="western"><surname>Janacsek</surname><given-names>K</given-names> </name></person-group><article-title>The relationship between subjective sleep quality and cognitive performance in healthy young adults: evidence from three empirical studies</article-title><source>Sci Rep</source><year>2020</year><month>03</month><day>17</day><volume>10</volume><issue>1</issue><fpage>4855</fpage><pub-id pub-id-type="doi">10.1038/s41598-020-61627-6</pub-id><pub-id pub-id-type="medline">32184462</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Donnell</surname><given-names>D</given-names> </name><name name-style="western"><surname>Silva</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>M&#x00FC;nch</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ronda</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Duffy</surname><given-names>JF</given-names> </name></person-group><article-title>Comparison of subjective and objective assessments of sleep in healthy older subjects without sleep complaints</article-title><source>J Sleep Res</source><year>2009</year><month>06</month><volume>18</volume><issue>2</issue><fpage>254</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2869.2008.00719.x</pub-id><pub-id pub-id-type="medline">19645969</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aurora</surname><given-names>RN</given-names> </name><name name-style="western"><surname>Caffo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Crainiceanu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Punjabi</surname><given-names>NM</given-names> </name></person-group><article-title>Correlating subjective and objective sleepiness: revisiting the association using survival analysis</article-title><source>Sleep</source><year>2011</year><month>12</month><day>1</day><volume>34</volume><issue>12</issue><fpage>1707</fpage><lpage>1714</lpage><pub-id pub-id-type="doi">10.5665/sleep.1442</pub-id><pub-id pub-id-type="medline">22131609</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stanyte</surname><given-names>A</given-names> </name><name name-style="western"><surname>Podlipskyte</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alonderis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Macijauskiene</surname><given-names>J</given-names> </name><name name-style="western"><surname>Burkauskas</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Steibliene</surname><given-names>V</given-names> </name></person-group><article-title>Relationship between subjective and objective fatigue and sleep characteristics in individuals with anxiety and mood disorders: an exploratory study</article-title><source>Physiol Behav</source><year>2024</year><month>02</month><day>1</day><volume>274</volume><fpage>114429</fpage><pub-id pub-id-type="doi">10.1016/j.physbeh.2023.114429</pub-id><pub-id pub-id-type="medline">38065423</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Valko</surname><given-names>PO</given-names> </name><name name-style="western"><surname>Hunziker</surname><given-names>S</given-names> </name><name name-style="western"><surname>Graf</surname><given-names>K</given-names> </name><name name-style="western"><surname>Werth</surname><given-names>E</given-names> </name><name name-style="western"><surname>Baumann</surname><given-names>CR</given-names> </name></person-group><article-title>Sleep-wake misperception. 
A comprehensive analysis of a large sleep lab cohort</article-title><source>Sleep Med</source><year>2021</year><month>12</month><volume>88</volume><fpage>96</fpage><lpage>103</lpage><pub-id pub-id-type="doi">10.1016/j.sleep.2021.10.023</pub-id><pub-id pub-id-type="medline">34742039</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbey</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Garfinkel</surname><given-names>PE</given-names> </name></person-group><article-title>Chronic fatigue syndrome and depression: cause, effect, or covariate</article-title><source>Rev Infect Dis</source><year>1991</year><volume>13 Suppl 1</volume><fpage>S73</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.1093/clinids/13.supplement_1.s73</pub-id><pub-id pub-id-type="medline">2020805</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hermans</surname><given-names>LWA</given-names> </name><name name-style="western"><surname>Leufkens</surname><given-names>TR</given-names> </name><name name-style="western"><surname>van Gilst</surname><given-names>MM</given-names> </name><etal/></person-group><article-title>Sleep EEG characteristics associated with sleep onset misperception</article-title><source>Sleep Med</source><year>2019</year><month>05</month><volume>57</volume><fpage>70</fpage><lpage>79</lpage><pub-id pub-id-type="doi">10.1016/j.sleep.2019.01.031</pub-id><pub-id pub-id-type="medline">30897458</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herzog</surname><given-names>R</given-names> </name><name name-style="western"><surname>Crosbie</surname><given-names>F</given-names> 
</name><name name-style="western"><surname>Aloulou</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A continuous approach to explain insomnia and subjective-objective sleep discrepancy</article-title><source>Commun Biol</source><year>2025</year><month>03</month><day>12</day><volume>8</volume><issue>1</issue><fpage>423</fpage><pub-id pub-id-type="doi">10.1038/s42003-025-07794-6</pub-id><pub-id pub-id-type="medline">40075150</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>Proceedings of the 2019 Conference of the North</conf-name><conf-date>Jun 2-9, 2019</conf-date><conf-loc>Minneapolis, Minnesota</conf-loc><fpage>4171</fpage><lpage>4186</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://aclweb.org/anthology/N19-1">http://aclweb.org/anthology/N19-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Behera</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Jena</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rath</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Misra</surname><given-names>S</given-names> </name></person-group><article-title>Co-LSTM: Convolutional 
LSTM model for sentiment analysis in social big data</article-title><source>Inf Process Manag</source><year>2021</year><volume>58</volume><issue>1</issue><fpage>102435</fpage><pub-id pub-id-type="doi">10.1016/j.ipm.2020.102435</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Denecke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Reichenpfader</surname><given-names>D</given-names> </name></person-group><article-title>Sentiment analysis of clinical narratives: a scoping review</article-title><source>J Biomed Inform</source><year>2023</year><month>04</month><volume>140</volume><fpage>104336</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104336</pub-id><pub-id pub-id-type="medline">36958461</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Convolutional neural networks for sentence classification</article-title><year>2014</year><conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Oct 25-29, 2014</conf-date><pub-id pub-id-type="doi">10.3115/v1/D14-1181</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rehman</surname><given-names>AU</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Raza</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>W</given-names> </name></person-group><article-title>A hybrid CNN-LSTM model for improving accuracy of movie reviews 
sentiment analysis</article-title><source>Multimed Tools Appl</source><year>2019</year><month>09</month><volume>78</volume><issue>18</issue><fpage>26597</fpage><lpage>26613</lpage><pub-id pub-id-type="doi">10.1007/s11042-019-07788-7</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name></person-group><article-title>Attention-based bidirectional long short-term memory networks for relation classification using knowledge distillation from BERT</article-title><conf-name>2020 IEEE Intl Conf on Dependable, Autonomic and Secure Computing, Intl Conf on Pervasive Intelligence and Computing, Intl Conf on Cloud and Big Data Computing, Intl Conf on Cyber Science and Technology Congress (DASC/PiCom/CBDCom/CyberSciTech)</conf-name><conf-date>Aug 17-22, 2020</conf-date><pub-id pub-id-type="doi">10.1109/DASC-PICom-CBDCom-CyberSciTech49142.2020.00100</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name></person-group><article-title>Temporal-spatial three-way granular computing for dynamic text sentiment classification</article-title><source>Inf Sci (Ny)</source><year>2022</year><month>06</month><volume>596</volume><fpage>551</fpage><lpage>566</lpage><pub-id pub-id-type="doi">10.1016/j.ins.2022.03.036</pub-id><pub-id 
pub-id-type="medline">35693835</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>PBCNN: packet bytes-based convolutional neural network for network intrusion detection</article-title><source>Comput Netw</source><year>2021</year><month>07</month><volume>194</volume><fpage>108117</fpage><pub-id pub-id-type="doi">10.1016/j.comnet.2021.108117</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name></person-group><article-title>Recurrent neural network for text classification with multi-task learning</article-title><conf-name>IJCAI&#x2019;16: Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence</conf-name><conf-date>Jul 9-15, 2016</conf-date><conf-loc>New York USA</conf-loc><fpage>2873</fpage><lpage>2879</lpage></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ziegler</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Stiennon</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Fine-tuning language models from human 
preferences</article-title><source>arXiv</source><comment>Preprint posted online on 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.1909.08593</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schulman</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wolski</surname><given-names>F</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> </name><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Klimov</surname><given-names>O</given-names> </name></person-group><article-title>Proximal policy optimization algorithms</article-title><source>arXiv</source><comment>Preprint posted online on 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1707.06347</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pascual</surname><given-names>D</given-names> </name><name name-style="western"><surname>Egressy</surname><given-names>B</given-names> </name><name name-style="western"><surname>Meister</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cotterell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wattenhofer</surname><given-names>R</given-names> </name></person-group><article-title>A plug-and-play method for controlled text generation</article-title><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Nov 7-11, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.findings-emnlp.334</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Denecke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name></person-group><article-title>Sentiment analysis in medical settings: new opportunities and challenges</article-title><source>Artif Intell Med</source><year>2015</year><month>05</month><volume>64</volume><issue>1</issue><fpage>17</fpage><lpage>27</lpage><pub-id pub-id-type="doi">10.1016/j.artmed.2015.03.006</pub-id><pub-id pub-id-type="medline">25982909</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aellen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Van der Meer</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dietmann</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bassetti</surname><given-names>CLA</given-names> </name><name name-style="western"><surname>Tzovara</surname><given-names>A</given-names> </name></person-group><article-title>The Bern Sleep Database: clustering of patients with sleep disorders</article-title><source>Sleep Med</source><year>2022</year><month>12</month><volume>100</volume><fpage>S106</fpage><pub-id pub-id-type="doi">10.1016/j.sleep.2022.05.295</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Werra</surname><given-names>L</given-names> </name></person-group><article-title>Tune GPT-2 to generate controlled sentiment reviews</article-title><source>Google Colab</source><access-date>2025-10-15</access-date><comment>Preprint, under review</comment><comment><ext-link ext-link-type="uri" xlink:href="https://colab.research.google.com/drive/19CWnz8xhuV026nBuDEEUEMflTK9ENMIj?usp=sharing">https://colab.research.google.com/drive/19CWnz8xhuV026nBuDEEUEMflTK9ENMIj?usp=sharing</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Falcon 40B LLM repository</article-title><source>Hugging Face</source><access-date>2025-10-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/tiiuae/falcon-40b-instruct">https://huggingface.co/tiiuae/falcon-40b-instruct</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Werra</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belkada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tunstall</surname><given-names>L</given-names> </name><etal/></person-group><article-title>TRL: transformer reinforcement learning</article-title><source>GitHub</source><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/huggingface/trl">https://github.com/huggingface/trl</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Song</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Song</surname><given-names>D</given-names> </name></person-group><article-title>A survey of controllable 
text generation using transformer-based pre-trained language models</article-title><source>ACM Comput Surv</source><year>2024</year><month>03</month><day>31</day><volume>56</volume><issue>3</issue><fpage>1</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.1145/3617680</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Mixtral 8x7b repository</article-title><source>Hugging Face</source><access-date>2025-10-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>BERT base multilingual cased</article-title><source>Hugging Face</source><access-date>2025-10-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/google-bert/bert-base-multilingual-cased">https://huggingface.co/google-bert/bert-base-multilingual-cased</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>XLM-roberta base</article-title><source>Hugging Face</source><access-date>2025-10-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/FacebookAI/xlm-roberta-base">https://huggingface.co/FacebookAI/xlm-roberta-base</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Werra</surname><given-names>L</given-names> </name></person-group><article-title>Tune GPT-2 to generate positive reviews</article-title><source>Hugging Face</source><access-date>2025-10-14</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://huggingface.co/tiiuae/falcon-40b-instruct">https://huggingface.co/tiiuae/falcon-40b-instruct</ext-link></comment><!-- NOTE(review): this URL duplicates ref 27 (Falcon 40B repository) and does not match the cited title "Tune GPT-2 to generate positive reviews"; verify the intended Hugging Face link --></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><source>German GPT-2</source><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/benjamin/gerpt2">https://huggingface.co/benjamin/gerpt2</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Code repository</article-title><source>GitHub</source><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/dyhpf/sleepmisperception">https://github.com/dyhpf/sleepmisperception</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Mismatch between subjective sleepiness (Karolinska Sleepiness Scale) and objective sleep latency (Multiple Sleep Latency Test) with a threshold of 10 and 12 minutes among 100 patients.</p><media xlink:href="medinform_v13i1e70753_app1.png" xlink:title="PNG File, 133 KB"/></supplementary-material></app-group></back></article>