<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e74902</article-id><article-id pub-id-type="doi">10.2196/74902</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Vision-Language&#x2013;Guided Multimodal Fusion Network for Glottic Carcinoma Early Diagnosis: Model Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jin</surname><given-names>Zhaohui</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Shuai</surname><given-names>Yi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Li</surname><given-names>Yun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Mianmian</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yumeng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lei</surname><given-names>Wenbin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Fan</surname><given-names>Xiaomao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>College of Big Data and Internet, Shenzhen Technology University</institution><addr-line>Pingshan District, 3002 Lantian Road</addr-line><addr-line>Shenzhen</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff2"><institution>The First Affiliated Hospital, Sun Yat-sen University</institution><addr-line>Guangzhou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Jin</surname><given-names>Qiao</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gu</surname><given-names>Ran</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yang</surname><given-names>Zhikai</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Xiaomao Fan, PhD, College of Big Data and Internet, Shenzhen Technology University, Pingshan District, 3002 Lantian Road, Shenzhen, Guangdong, 518118, China, 86 19276679344; <email>astrofan2008@gmail.com</email></corresp><fn fn-type="equal" 
id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>8</day><month>10</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e74902</elocation-id><history><date date-type="received"><day>27</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Zhaohui Jin, Yi Shuai, Yun Li, Mianmian Chen, Yumeng Liu, Wenbin Lei, Xiaomao Fan. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 8.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e74902"/><abstract><sec><title>Background</title><p>Early diagnosis and intervention in glottic carcinoma (GC) can significantly improve long-term prognosis. 
However, the accurate diagnosis of early GC is challenging due to its morphological similarity to vocal cord dysplasia, with the difficulty further exacerbated in medically underserved areas.</p></sec><sec><title>Objective</title><p>This study aims to address the limitations of existing technologies by designing a vision-language multimodal model, providing a more efficient and accurate early diagnostic method for GC.</p></sec><sec sec-type="methods"><title>Methods</title><p>The data used in this study were sourced from the information system of the First Affiliated Hospital of Sun Yat-sen University, comprising laryngoscopy reports and 5796 laryngoscopic images from 404 patients with glottic lesions. We propose a vision-language&#x2013;guided multimodal fusion network (VLMF-Net) based on a large vision-language model for the early automated diagnosis of GC. The text processing module of this model uses the pretrained Large Language Model Meta AI (LLaMa) to generate text vector representations, while the image processing module uses a pretrained vision transformer to extract features from laryngoscopic images, achieving cross-modal alignment through the Q-Former module. By leveraging a feature fusion module, deep integration of text and image features is achieved, ultimately enabling classification diagnosis. 
To validate the model&#x2019;s performance, the study selected contrastive language-image pretraining (CLIP), bootstrapping language-image pretraining with frozen image encoders and large language models (BLIP-2), a large-scale image and noisy-text embedding (ALIGN), and vision-and-language transformer (VILT) as baseline methods for experimental evaluation on the same dataset, with comprehensive performance assessment conducted using accuracy, recall, precision, <italic>F</italic><sub>1</sub>-score, and area under the curve.</p></sec><sec sec-type="results"><title>Results</title><p>We found that on the internal test set, the VLMF-Net model significantly outperformed existing methods with an accuracy of 77.6% (CLIP: 70.5%; BLIP-2: 71.5%; ALIGN: 67.3%; and VILT: 64.3%), achieving a 6.1-percentage point improvement over the best baseline model (BLIP-2). On the external test set, our method also demonstrated robust performance, achieving an accuracy of 73.9%, which is 4.6 percentage points higher than the second-best model (BLIP-2: 69.3%). This indicates that our model surpasses these methods in the early diagnosis of GC and exhibits strong generalization ability and robustness.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The proposed VLMF-Net model can be effectively used for the early diagnosis of GC, helping to address the challenges in its early detection.</p></sec></abstract><kwd-group><kwd>glottic carcinoma early diagnosis</kwd><kwd>multimodal deep learning</kwd><kwd>large-scale foundation model</kwd><kwd>computer-aided diagnosis</kwd><kwd>clinical decision making</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Glottic carcinoma (GC) is a common malignant tumor of the head and neck [<xref ref-type="bibr" rid="ref1">1</xref>]. 
According to GLOBOCAN, China reported 29,500 new cases and 16,900 deaths from laryngeal cancer in 2022 [<xref ref-type="bibr" rid="ref2">2</xref>], posing heavy burdens on health care systems. Approximately 60% of patients are diagnosed at an advanced stage [<xref ref-type="bibr" rid="ref3">3</xref>], leading to significant impairment of vital physiological functions and compromising both physical and mental health. Early diagnosis of malignant tumors has been increasingly emphasized in clinical practice due to its potential to improve cure rates and organ function preservation [<xref ref-type="bibr" rid="ref4">4</xref>]. Therefore, optimizing diagnostic methods for GC and enhancing early detection capability are urgent tasks for otolaryngologists.</p><p>Laryngoscopy is the primary diagnostic tool for GC [<xref ref-type="bibr" rid="ref1">1</xref>], offering direct visualization of lesion shape, extent, and surface texture. When combined with narrow band imaging, it enhances early tumor detection by identifying neovascularization [<xref ref-type="bibr" rid="ref5">5</xref>], making it a valuable tool for early diagnosis. However, vocal cord dysplasia (VCD), a precancerous condition situated between normal epithelium and squamous cell carcinoma, is characterized by a small lesion with clinical and laryngoscopic features similar to early GC [<xref ref-type="bibr" rid="ref6">6</xref>]. Therefore, it is challenging for the human eye to distinguish between them. In addition, the lesion is often covered by &#x201C;leukoplakia-like&#x201D; substance, which interferes with the ability of narrow band imaging to reveal submucosal vasculature [<xref ref-type="bibr" rid="ref7">7</xref>], increasing the risk of misdiagnosis. Laryngoscopy reports provide textual descriptions of lesion morphology observed dynamically during the examination, supplementing static images and assisting in diagnostic decision-making. 
Studies have demonstrated a correlation between morphological grading and malignancy risk [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], underscoring the diagnostic value of textual reports. Furthermore, reports authored by experienced clinicians serve as valuable references, facilitating more accurate diagnoses by less experienced clinicians. Histopathology examination remains the gold standard for diagnosis [<xref ref-type="bibr" rid="ref1">1</xref>]. However, biopsy is invasive, painful, and carries procedural risks [<xref ref-type="bibr" rid="ref11">11</xref>], impeding its widespread application in large-scale clinical screening. To address this issue, it is necessary to develop an efficient and noninvasive method to improve the diagnostic accuracy of early GC.</p><p>Recently, significant progress has been made in deep learning techniques for tackling real-world classification tasks in computer vision and natural language processing [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. Many researchers have sought to apply these models to the detection of laryngeal cancer, yielding promising outcomes. However, most existing methods only use laryngoscopic images as input, including UC-DenseNet [<xref ref-type="bibr" rid="ref15">15</xref>], MTANet [<xref ref-type="bibr" rid="ref16">16</xref>], Dlgnet [<xref ref-type="bibr" rid="ref17">17</xref>], RedFormer [<xref ref-type="bibr" rid="ref18">18</xref>], and SAM-FNet [<xref ref-type="bibr" rid="ref19">19</xref>]. Although these methods have demonstrated promising performance in laryngeal cancer detection and other tasks, they neglect certain latent information present in other modalities. 
Such information is typically inaccessible through unimodal approaches, thereby highlighting the advantages of multimodal methodologies [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Therefore, this study proposes a novel method named vision-language&#x2013;guided multimodal fusion network (VLMF-Net) for early diagnosis of GC. Addressing the limitations of traditional single-modality diagnostic methods, our approach integrates the images and report text of laryngoscopy to provide a more comprehensive representation of lesion characteristics. Using a pretrained vision transformer (ViT) [<xref ref-type="bibr" rid="ref22">22</xref>] model for image feature extraction and a LLaMA3 [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] model fine-tuned for text processing, we achieve effective multimodal feature fusion. Compared to single-modality methods, our approach significantly improves the diagnostic accuracy and robustness, achieving an accuracy of 0.776 on real clinical datasets. This study highlights the potential of multimodal fusion in clinical auxiliary diagnosis and provides new insights for reducing misdiagnosis rates and improving patient treatment outcomes.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>In our study, we constructed 2 datasets for model development and validation. First, we built an internal dataset for model training, validation, and testing. For the internal dataset, we collected data from 404 patients with glottic lesions at the First Affiliated Hospital of Sun Yat-sen University in Guangzhou, China. This dataset consists of 5799 professionally annotated image-text pairs, covering two types of lesions: VCD and GC. 
Each sample includes a laryngoscopic diagnostic report written by an experienced otolaryngologist and its corresponding laryngoscopic image.</p><p>In addition, to assess the model&#x2019;s generalization ability and robustness, we constructed an external dataset. The external dataset was collected from the First People&#x2019;s Hospital of Zhaoqing, consisting of data from 47 patients with glottic lesions between January 1, 2018, and August 31, 2024. This dataset includes 308 image-text pairs and strictly follows the principle of isolation from the training data, serving only for final performance evaluation. For detailed information on internal and external datasets, please refer to <xref ref-type="fig" rid="figure1">Figure 1</xref> and <xref ref-type="table" rid="table1">Table 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The specific form of data. GC: glottic carcinoma; VCD: vocal cord dysplasia.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e74902_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The statistics of datasets.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Datasets</td><td align="left" valign="bottom">Internal dataset</td><td align="left" valign="bottom">External dataset</td></tr></thead><tbody><tr><td align="left" valign="top">Number of laryngoscopy reports</td><td align="char" char="." valign="top">404</td><td align="char" char="." valign="top">47</td></tr><tr><td align="left" valign="top">Number of VCD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="char" char="." valign="top">206</td><td align="char" char="." valign="top">22</td></tr><tr><td align="left" valign="top">Number of GC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="char" char="." 
valign="top">198</td><td align="char" char="." valign="top">25</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>VCD: vocal cord dysplasia.</p></fn><fn id="table1fn2"><p><sup>b</sup>GC: glottic carcinoma.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2"><title>Model Architecture</title><sec id="s2-2-1"><title>Overview</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the overall architecture of our proposed VLMF-Net model, which consists of 3 main modules: the laryngoscopic image encoder, the clinical report encoder, and the laryngeal feature fusion module. Specifically, the laryngoscopic image encoder is responsible for extracting visual feature representations from laryngoscopic images, while the clinical report encoder captures textual feature representations from the patient&#x2019;s laryngoscopic examination findings. These features are then fused through the laryngeal feature fusion module. Finally, the fused features are passed through a fully connected layer to complete the classification task. The detailed implementation of each module in our proposed VLMF-Net model is as follows:</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The overall architecture of our proposed vision-language&#x2013;guided multimodal fusion network (VLMF-Net) model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e74902_fig02.png"/></fig></sec><sec id="s2-2-2"><title>Laryngoscopic Image Encoder</title><p>The laryngoscopic image encoder is an adapted version of a pretrained ViT [<xref ref-type="bibr" rid="ref22">22</xref>] model. 
Considering that a single ViT model may introduce cross-modal discrepancies in multimodal tasks, potentially affecting model accuracy, we incorporated an additional Q-Former [<xref ref-type="bibr" rid="ref25">25</xref>] module into the ViT model to mitigate the impact of modality differences and bridge the gap between image and text features. Q-Former is a trainable module based on Transformer that extracts and condenses visual features through alternating stacking of self-attention and cross-attention. Specifically, we first use ViT to extract features from the image, and then we input the extracted image features into a Q-Former module with frozen parameters to reduce modal differences.</p><p>Formally, let <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> denote the image encoder and <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> denote the Q-Former module. 
Given an image <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, first resize the image to 224&#x00D7;224&#x00D7;3, then the image feature <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is obtained as follows:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>enc</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>enc</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn6"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the weight parameters of the image encoder and the Q-Former module, the shape of <inline-formula><mml:math id="ieqn7"><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is 1&#x00D7;1024. 
It is worth noting that in terms of the weights of the pretrained model, we adopt the transfer learning strategy: we directly use the pretrained weights of the BLIP-2 [<xref ref-type="bibr" rid="ref25">25</xref>] model as the weight parameters of the laryngoscope image encoder, because a large number of studies have demonstrated the effectiveness of BLIP-2 in downstream tasks [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. Such operations can ensure that the extracted image features and text features are in the same scale space, thus effectively avoiding catastrophic problems caused by modal differences.</p></sec><sec id="s2-2-3"><title>Clinical Report Encoder</title><p>In the clinical report encoder module, we use LLaMA3 [<xref ref-type="bibr" rid="ref23">23</xref>], an advanced large language model, as the text feature extractor to obtain textual feature representations from the patient&#x2019;s laryngoscopic examination findings. This model is renowned for its exceptional ability to understand long-form text.</p><p>Formally, let <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>M</mml:mi><mml:mi>A</mml:mi><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> denote the clinical report encoder function. 
Given a clinical report <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></inline-formula> the process of obtaining the text feature <inline-formula><mml:math id="ieqn10"><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> can be formulated as follows:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>LLaMa3</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn11"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>represents the weight parameters of the clinical report encoder, the shape of <inline-formula><mml:math id="ieqn12"><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is 1&#x00D7;4096. Notably, we adopt the same transfer learning strategy for the clinical report encoder&#x2019;s weight parameters as we did for the laryngoscopic image encoder. Furthermore, considering that the text in our dataset is in Chinese, we use a fine-tuned LLaMA3 model trained on a Chinese dataset as the encoder&#x2019;s weight source [<xref ref-type="bibr" rid="ref24">24</xref>]. 
This approach effectively addresses the original LLaMA3 model&#x2019;s limitations in Chinese language understanding.</p></sec><sec id="s2-2-4"><title>Laryngeal Feature Fusion</title><p>In the 2 modules described above, we obtained image and text features. To effectively fuse these 2 modalities, we introduced the laryngeal feature fusion module, which aligns, maps, and integrates the features from both modalities. Specifically, we first use the vision projector <inline-formula><mml:math id="ieqn13"><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> and the text projector <inline-formula><mml:math id="ieqn14"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> to map the image feature <inline-formula><mml:math id="ieqn15"><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and text feature <inline-formula><mml:math id="ieqn16"><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> into a unified feature space:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msubsup><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>vp</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>vp</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>tp</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>tp</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Where <inline-formula><mml:math id="ieqn17"><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn18"><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> represent the mapped image and text features (their shapes are 1&#x00D7;512 and 1&#x00D7;2048), respectively, and <inline-formula><mml:math id="ieqn19"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math 
id="ieqn20"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the learnable parameters of the projectors <inline-formula><mml:math id="ieqn21"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn22"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> .</p><p>Next, we apply L2 normalization to both features to ensure consistency, obtaining:</p><p><inline-formula><mml:math id="ieqn23"><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mo>|</mml:mo><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></inline-formula>and <inline-formula><mml:math 
id="ieqn24"><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></inline-formula>. Then we concatenate<inline-formula><mml:math id="ieqn25"><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and text feature<inline-formula><mml:math id="ieqn26"><mml:mi> </mml:mi><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> along the feature dimension to form a vision-language joint representation, denoted as:<inline-formula><mml:math id="ieqn27"><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mfenced
separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced></mml:math></inline-formula>. This joint feature is then fed into a classifier, which consists of multiple fully connected layers, dropout layers, and ReLU activation functions. The classification process is formulated as:</p><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>fc</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>fc</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn28"><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the learnable parameters of the classifier.</p></sec></sec><sec id="s2-3"><title>Model Training</title><p>The internal dataset is divided at the patient level into training, validation, and test sets in a ratio of 8:1:1, strictly following the principle of data isolation. 
Regarding the loss function during training, we use the cross-entropy loss function <inline-formula><mml:math id="ieqn29"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="script">c</mml:mi><mml:mi mathvariant="script">e</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and the contrastive loss function<inline-formula><mml:math id="ieqn30"><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="script">c</mml:mi><mml:mi mathvariant="script">t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></inline-formula> which are formulated as follows:</p><disp-formula id="E6"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mi class="mathcal" mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E7"><label>(7)</label><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mi class="mathcal" 
mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mfrac><mml:mrow><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mn mathvariant="bold">1</mml:mn></mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x2260;</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:msub><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In <inline-formula><mml:math id="ieqn31"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="script">c</mml:mi><mml:mi mathvariant="script">e</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <italic>C</italic> represents the total number of classes,  <inline-formula><mml:math 
id="ieqn32"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the ground-truth class label in one-hot encoding, and <inline-formula><mml:math id="ieqn33"><mml:mover accent="true"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> is the predicted probability for class <italic>i</italic>. This loss function aims to minimize the difference between the model&#x2019;s predicted distribution and the true distribution, thereby guiding parameter updates and optimization. In <inline-formula><mml:math id="ieqn34"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="script">c</mml:mi><mml:mi mathvariant="script">t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn35"><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo>(</mml:mo><mml:mo>&#x00B7;</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> represents cosine similarity,<inline-formula><mml:math id="ieqn36"><mml:mi mathvariant="normal"> </mml:mi><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn37"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are positive sample pairs, <inline-formula><mml:math id="ieqn38"><mml:mi>&#x03C4;</mml:mi></mml:math></inline-formula> is the temperature parameter, and <italic>N</italic> is the number of negative samples. 
The final loss function <inline-formula><mml:math id="ieqn39"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is formulated as follows:</p><disp-formula id="E8"> <label>(8)</label><mml:math id="eqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mi class="mathcal" mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi class="mathcal" mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mrow><mml:mi class="mathcal" mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where c=0.1. All training processes and experiments are conducted on a dedicated server equipped with 4 NVIDIA A6000 GPUs with a total of 196GB of VRAM. The system runs on Ubuntu 20.04.5 LTS, and the model is implemented using PyTorch (Python 3.9.0) and Scikit-learn 1.3.1. In this study, we use the AdamW optimizer to optimize VLMF-Net, with an initial learning rate set to 0.00001. A warm-up strategy and cosine learning rate scheduling are adopted to dynamically adjust the learning rate. VLMF-Net is trained for a total of 80 epochs.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study was approved by the Ethics Committee of the First Affiliated Hospital of Sun Yat-sen University (approval number [2023]755&#x2010;1). Informed consent was waived by the institutional review boards of all participating hospitals due to the study&#x2019;s retrospective design. 
We implemented stringent measures to protect the privacy of all participants by anonymizing all collected data to remove any personally identifiable information. Throughout the manuscript preparation, we diligently avoided disclosing any details that could reveal the identity of participants. Furthermore, no compensation or indemnity was required as the study did not cause any losses to participants beyond the necessary clinical diagnostic and therapeutic measures.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Result of Our Model</title><p>To demonstrate the effectiveness and advantages of our proposed VLMF-Net, we selected 4 classic models as baseline models and performed comparisons on 2 datasets. The 4 models include vision-and-language transformer (VILT) [<xref ref-type="bibr" rid="ref29">29</xref>], contrastive language-image pretraining (CLIP) [<xref ref-type="bibr" rid="ref30">30</xref>], bootstrapping language-image pretraining with frozen image encoders and large language models (BLIP-2) [<xref ref-type="bibr" rid="ref25">25</xref>], and a large-scale image and noisy-text embedding (ALIGN) [<xref ref-type="bibr" rid="ref31">31</xref>], and the 2 datasets refer to the internal and external datasets mentioned earlier. <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref> present the average results of 5 trials for the 4 baseline models and our proposed model on the internal and external datasets, respectively. <xref ref-type="fig" rid="figure3">Figure 3</xref> shows the receiver operating characteristic curves and corresponding area under the curve values of different models on internal and external datasets. 
As shown in <xref ref-type="table" rid="table2">Table 2</xref>, on the internal dataset, our method demonstrates significant advantages, achieving the following evaluation metrics: accuracy (0.776), precision (0.820), and <italic>F</italic><sub>1</sub>-score (0.776). Notably, compared to the second-best model, our method achieves significant improvements of 0.061, 0.032, and 0.046 in accuracy, precision, and <italic>F</italic><sub>1</sub>-score, respectively. In terms of class-wise recall, VLMF-Net achieves recall rates of 0.754 and 0.803 for VCD and GC, respectively. Compared to other models, VLMF-Net shows significant improvements across multiple evaluation metrics, indicating its superior ability in recognizing glottic cancer. In addition, as shown in <xref ref-type="table" rid="table3">Table 3</xref>, on the external dataset, our method also demonstrates significant advantages, achieving accuracy (0.739), precision (0.828), and <italic>F</italic><sub>1</sub>-score (0.737), with improvements of 0.046, 0.053, and 0.046 over the second-best model, respectively. In terms of class-wise recall, our model achieves recall rates of 0.701 and 0.793 for VCD and GC, respectively. Compared to other models, there is also a significant improvement, which further demonstrates that VLMF-Net possesses excellent generalization ability and robustness. Moreover, as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, our model demonstrates significant advantages in area under the curve metrics on both internal and external datasets. 
On the internal dataset, it outperforms the second-best model by 0.026, and on the external dataset by 0.012.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison with other multimodal models on an internal dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom" colspan="3">Overall results</td><td align="left" valign="bottom" colspan="2">Recall of different classes</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy, mean (SD)</td><td align="left" valign="bottom">Precision, mean (SD)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, mean (SD)</td><td align="left" valign="bottom">VCD<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="bottom">GC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">VILT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.643 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.677 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.643 (0.01<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.632 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.656 (0.01<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">CLIP<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.705 (0.01<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.788 (0.02<sup><xref ref-type="table-fn" 
rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.703 (0.01<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.672 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.750 (0.01<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">BLIP-2<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">0.715 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.770 (0.03<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.714 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.694 (0.04<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.742 (0.03<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">ALIGN<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">0.673 (0.03<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.707 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.673 (0.04<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.660 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">0.688 (0.02<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">VLMF-Net<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">0.776 (0.01)</td><td align="left" valign="top">0.820 (0.02)</td><td align="left" valign="top">0.776 
(0.01)</td><td align="left" valign="top">0.754 (0.02)</td><td align="left" valign="top">0.803 (0.01)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>VCD: vocal cord dysplasia.</p></fn><fn id="table2fn2"><p><sup>b</sup>GC: glottic carcinoma.</p></fn><fn id="table2fn3"><p><sup>c</sup>VILT: vision-and-language transformer.</p></fn><fn id="table2fn4"><p><sup>d</sup><italic>P</italic>&#x003C;.001.</p></fn><fn id="table2fn5"><p><sup>e</sup>CLIP: contrastive language-image pretraining.</p></fn><fn id="table2fn6"><p><sup>f</sup>BLIP-2: bootstrapping language-image pretraining with frozen image encoders and large language models.</p></fn><fn id="table2fn7"><p><sup>g</sup>ALIGN: a large-scale image and noisy-text embedding.</p></fn><fn id="table2fn8"><p><sup>h</sup>VLMF-Net: vision-language guided multimodal fusion network.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison with other multimodal models on an external dataset.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom" colspan="3">Overall results</td><td align="left" valign="bottom" colspan="2">Recall of different classes</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy, mean (SD)</td><td align="left" valign="bottom">Precision, mean (SD)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, mean (SD)</td><td align="left" valign="bottom">VCD<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="bottom">GC<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">VILT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.631 (0.02<sup><xref ref-type="table-fn" 
rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.663 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.630 (0.01<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.633 (0.01<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.628 (0.01<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">CLIP<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">0.686 (0.01<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.748 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.685 (0.01<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.670 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.708 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">BLIP-2<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">0.693 (0.03<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.775 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.691 (0.03<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.669 (0.03<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.726 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">ALIGN<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" 
valign="top">0.647 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.680 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.647 (0.03<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.642 (0.02<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">0.653 (0.03<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">VLMF-Net<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">0.739 (0.02)</td><td align="left" valign="top">0.828 (0.02)</td><td align="left" valign="top">0.737 (0.02)</td><td align="left" valign="top">0.701 (0.03)</td><td align="left" valign="top">0.793 (0.02)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>VCD: vocal cord dysplasia.</p></fn><fn id="table3fn2"><p><sup>b</sup>GC: glottic carcinoma.</p></fn><fn id="table3fn3"><p><sup>c</sup>VILT: vision-and-language transformer.</p></fn><fn id="table3fn4"><p><sup>d</sup><italic>P</italic> value &#x003C;.001.</p></fn><fn id="table3fn5"><p><sup>e</sup>CLIP: contrastive language-image pretraining.</p></fn><fn id="table3fn6"><p><sup>f</sup>BLIP-2: bootstrapping language-image pretraining with frozen image encoders and large language models.</p></fn><fn id="table3fn7"><p><sup>g</sup>ALIGN: a large-scale image and noisy-text embedding.</p></fn><fn id="table3fn8"><p><sup>h</sup>VLMF-Net: vision-language guided multimodal fusion network.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The ROC curves for different models. 
ALIGN: a large-scale image and noisy-text embedding; BLIP-2: bootstrapping language-image pretraining with frozen image encoders and large language models; CLIP: contrastive language-image pretraining; ROC: receiver operating characteristic; VILT: vision-and-language transformer; VLMF-Net: vision-language guided multimodal fusion network.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e74902_fig03.png"/></fig></sec><sec id="s3-2"><title>Ablation Studies</title><p>To validate the effectiveness of our multimodal approach for early diagnosis of GC, we designed an ablation study to systematically evaluate the performance differences between models using single-modal and multimodal inputs: (1) M1: a single-modal model using only laryngoscopy images; (2) M2: a single-modal model using only laryngoscopy diagnostic reports; (3) M3: a multimodal model combining images and text, but without Q-Former. (4) M4: a multimodal model combining images and text with Q-Former. The experimental results, as shown in <xref ref-type="table" rid="table4">Table 4</xref>, indicate that the multimodal model M4 significantly outperforms both single-modal models across all evaluation metrics. Specifically, compared to the best single-modal model, M4 achieves improvements of 0.098 in accuracy (0.776), 0.106 in precision (0.820), 0.100 in recall (0.779), and 0.098 in <italic>F</italic><sub>1</sub>-score (0.776). 
This demonstrates that the multimodal data fusion strategy effectively integrates visual features and textual semantic information, significantly enhancing the diagnostic performance of the model.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Ablation study on vision-language&#x2013;guided multimodal fusion network.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variants</td><td align="left" valign="bottom">Image</td><td align="left" valign="bottom">Report</td><td align="left" valign="bottom">Q-Former</td><td align="left" valign="bottom">Accuracy, mean (SD)</td><td align="left" valign="bottom">Precision, mean (SD)</td><td align="left" valign="bottom">Recall, mean (SD)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">M1</td><td align="left" valign="top">&#x221A;</td><td align="char" char="plusmn" valign="top">0.678 (0.02)</td><td align="char" char="plusmn" valign="top">0.714 (0.03)</td><td align="char" char="plusmn" valign="top">0.679 (0.03)</td><td align="char" char="plusmn" valign="top">0.678 (0.03)</td></tr><tr><td align="left" valign="top">M2</td><td align="left" valign="top">&#x221A;</td><td align="char" char="plusmn" valign="top">0.673 (0.01)</td><td align="char" char="plusmn" valign="top">0.711 (0.02)</td><td align="char" char="plusmn" valign="top">0.675 (0.01)</td><td align="char" char="plusmn" valign="top">0.673 (0.01)</td></tr><tr><td align="left" valign="top">M3</td><td align="left" valign="top">&#x221A;</td><td align="left" valign="top">&#x221A;</td><td align="char" char="plusmn" valign="top">0.722 (0.02)</td><td align="char" char="plusmn" valign="top">0.708 (0.01)</td><td align="char" char="plusmn" valign="top">0.723 (0.01)</td><td align="char" char="plusmn" valign="top">0.722 (0.03)</td></tr><tr><td align="left" valign="top">M4</td><td align="left" 
valign="top">&#x221A;</td><td align="left" valign="top">&#x221A;</td><td align="left" valign="top">&#x221A;</td><td align="char" char="plusmn" valign="top">0.776 (0.01)</td><td align="char" char="plusmn" valign="top">0.820 (0.02)</td><td align="char" char="plusmn" valign="top">0.779 (0.02)</td><td align="char" char="plusmn" valign="top">0.776 (0.01)</td></tr></tbody></table></table-wrap><p>In addition, to verify the effectiveness of the Q-Former module, we designed ablation experiments related to the Q-Former module. The experimental results are shown in <xref ref-type="table" rid="table4">Table 4</xref>. M4 outperforms M3 in all indicators. Specifically, M4&#x2019;s accuracy (0.776) increased by 0.054, precision (0.820) increased by 0.112, recall (0.779) increased by 0.056, and <italic>F</italic><sub>1</sub>-score (0.776) increased by 0.054. This indicates that the Q-Former module effectively reduces the impact of cross-modal differences.</p></sec><sec id="s3-3"><title>Visualization for Model Prediction</title><p>To enhance the interpretability of the VLMF-Net model&#x2019;s predictions, we used the Grad-CAM (Gradient-Weighted Class Activation Mapping) [<xref ref-type="bibr" rid="ref32">32</xref>] algorithm to generate class activation heatmaps. The visualization results are shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>. In the heatmap, we can see that VLMF-Net focuses on the lesion area of the patient when analyzing the laryngoscopy image. 
This indicates that the model is able to correctly classify based on the features of the lesion area, enabling early diagnosis of GC.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>The heatmaps generated by the vision-language guided multimodal fusion network (VLMF-Net) model on patient laryngoscopic images, where (<bold>A</bold>) represents glottic carcinoma and (<bold>B</bold>) and (<bold>C</bold>) represent vocal cord dysplasia.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e74902_fig04.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Main Findings</title><p>In our study, we developed a novel model for the early diagnosis of GC named VLMF-Net, which leverages multimodal fusion technology guided by vision-language information. When tested on an internal dataset, VLMF-Net achieved accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score of 0.776, 0.820, 0.779, and 0.776, respectively, outperforming all baseline methods. These results demonstrate the feasibility and effectiveness of VLMF-Net in the early diagnosis of GC. Ablation studies further reveal that VLMF-Net significantly outperforms unimodal models. By integrating both laryngoscopic images and clinical text reports, VLMF-Net captures complementary diagnostic information, thus mitigating the risk of information loss inherent in single-modality systems. Notably, the textual modality meaningfully guides the extraction of image features; for instance, textual cues explicitly describing the lesion&#x2019;s location (eg, &#x201C;anterior-middle segment of the left vocal cord&#x201D;) can help the model focus on the relevant visual regions. 
This form of cross-modal interaction may enhance the model&#x2019;s ability to detect subtle pathological patterns that might otherwise be overlooked.</p><p>Moreover, Grad-CAM&#x2013;based visualizations demonstrate that VLMF-Net consistently attends to clinically significant lesion areas. These attention heatmaps show strong alignment between the model&#x2019;s focus and expert-defined pathological regions, enhancing interpretability. Such interpretability is crucial in clinical contexts, where transparency in decision-making processes fosters trust and facilitates integration of AI systems into diagnostic workflows.</p><p>Finally, these findings validate the design philosophy of VLMF-Net and highlight the broader potential of multimodal fusion strategies in medical AI. In particular, the ability to synthesize visual and contextual clinical information allows for more robust and informed diagnostic decisions. This work not only advances the state of the art in GC diagnosis but also lays a foundation for extending vision-language multimodal techniques to other complex diagnostic tasks where rich multimodal data is available.</p></sec><sec id="s4-2"><title>Deep Learning Challenges in Early GC Diagnosis</title><p>Early GC lesions are usually small and exhibit complex morphological characteristics, making it challenging for deep learning models to capture fine-grained lesion features. As a result, important details related to the lesions may be missed, significantly affecting the model&#x2019;s diagnostic accuracy. Furthermore, most existing mainstream models rely solely on laryngoscopic images as input. While these models have achieved some progress in a unimodal setting, they overlook information from other modalities, which is often beyond the reach of image-based models [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. 
This limitation constrains the model&#x2019;s comprehensive understanding of lesion characteristics and ultimately affects diagnostic accuracy.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>To the best of our knowledge, this is the first study to apply multimodal techniques to the early diagnosis of GC. In our model, we fully utilize patients&#x2019; laryngoscopy reports and laryngoscopic images to extract relevant information about GC, enabling the model to comprehensively understand the patient&#x2019;s condition and thereby improve its performance. However, this study still has some limitations. First, the VLMF-Net, based on a pretrained large language model, relies heavily on powerful computational resources, particularly high-performance GPUs, during training, which may result in slower training and inference speeds. Second, the data used in this study underwent quality checks, but in real-world scenarios, more complex situations may arise, such as poor image clarity due to imaging devices or challenging shooting angles, which could affect the model&#x2019;s diagnostic accuracy.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this paper, we propose a VLMF-Net for the early diagnosis of GC. Extensive experiments on 2 datasets demonstrate that VLMF-Net achieves superior accuracy and robustness, effectively addressing the challenges of early GC diagnosis.</p></sec></sec></body><back><ack><p>We note that a shorter conference version of this paper appeared in the 2024 China Health Information Processing Conference (CHIP 2024). In our initial conference paper, we did not investigate whether the vision-language guided multimodal fusion network (VLMF-Net) model maintains its effectiveness across different datasets and consistently outperforms baseline models. This manuscript addresses these issues while further optimizing the originally proposed model through approaches such as hybrid loss function training. 
In addition, we use the Grad-CAM (Gradient-Weighted Class Activation Mapping) methodology to analyze the interpretability of VLMF-Net. This work is partially supported by the National Natural Science Foundation of China (62473267), the Basic and Applied Basic Research Project of Guangdong Province (2022B1515130009, 2025A1515011614), the Guangzhou Science and Technology Planning Project (No.2025B03J0019), the Special Subject on Agriculture and Social Development, Key Research and Development Plan in Guangzhou (2023B03J0172), and the Natural Science Foundation of Top Talent of SZTU (GDRC202318).</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ALIGN</term><def><p>a large-scale image and noisy-text embedding</p></def></def-item><def-item><term id="abb2">BLIP-2</term><def><p>bootstrapping language-image pretraining with frozen image encoders and large language models</p></def></def-item><def-item><term id="abb3">CLIP</term><def><p>contrastive language-image pretraining</p></def></def-item><def-item><term id="abb4">GC</term><def><p>glottic carcinoma</p></def></def-item><def-item><term id="abb5">Grad-CAM</term><def><p>Gradient-Weighted Class Activation Mapping</p></def></def-item><def-item><term id="abb6">LLaMa</term><def><p>Large Language Model Meta AI</p></def></def-item><def-item><term id="abb7">VCD</term><def><p>vocal cord dysplasia</p></def></def-item><def-item><term id="abb8">ViT</term><def><p>vision transformer</p></def></def-item><def-item><term id="abb9">VILT</term><def><p>vision-and-language transformer</p></def></def-item><def-item><term id="abb10">VLMF-Net</term><def><p>vision-language guided multimodal fusion network</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Steuer</surname><given-names>CE</given-names> </name><name name-style="western"><surname>El-Deiry</surname><given-names>M</given-names> </name><name name-style="western"><surname>Parks</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Saba</surname><given-names>NF</given-names> </name></person-group><article-title>An update on larynx cancer</article-title><source>CA Cancer J Clin</source><year>2017</year><month>01</month><volume>67</volume><issue>1</issue><fpage>31</fpage><lpage>50</lpage><pub-id pub-id-type="doi">10.3322/caac.21386</pub-id><pub-id pub-id-type="medline">27898173</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bray</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laversanne</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Global cancer statistics 2022: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2024</year><volume>74</volume><issue>3</issue><fpage>229</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.3322/caac.21834</pub-id><pub-id pub-id-type="medline">38572751</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Muller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>AY</given-names> </name><etal/></person-group><article-title>Patterns of extralaryngeal spread of 
laryngeal cancer: thyroid cartilage penetration occurs in a minority of patients with extralaryngeal spread of laryngeal squamous cell cancers</article-title><source>Cancer</source><year>2011</year><month>11</month><day>15</day><volume>117</volume><issue>22</issue><fpage>5047</fpage><lpage>5051</lpage><pub-id pub-id-type="doi">10.1002/cncr.26130</pub-id><pub-id pub-id-type="medline">21523761</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>LeBlanc</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mehta</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mills</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ampil</surname><given-names>F</given-names> </name><name name-style="western"><surname>Nathan</surname><given-names>CAO</given-names> </name></person-group><article-title>Improvements in survival and disparities for advanced-stage laryngeal cancer</article-title><source>JAMA Otolaryngol Head Neck Surg</source><year>2015</year><month>02</month><volume>141</volume><issue>2</issue><fpage>169</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1001/jamaoto.2014.2998</pub-id><pub-id pub-id-type="medline">25429594</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cosway</surname><given-names>B</given-names> </name><name name-style="western"><surname>Drinnan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Paleri</surname><given-names>V</given-names> </name></person-group><article-title>Narrow band imaging for the diagnosis of head and neck squamous cell carcinoma: a systematic review</article-title><source>Head 
Neck</source><year>2016</year><month>04</month><volume>38 Suppl 1</volume><fpage>E2358</fpage><lpage>67</lpage><pub-id pub-id-type="doi">10.1002/hed.24300</pub-id><pub-id pub-id-type="medline">26891200</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>You</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Han</surname><given-names>B</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Vocal cord leukoplakia classification using deep learning models in white light and narrow band imaging endoscopy images</article-title><source>Head Neck</source><year>2023</year><month>12</month><volume>45</volume><issue>12</issue><fpage>3129</fpage><lpage>3145</lpage><pub-id pub-id-type="doi">10.1002/hed.27543</pub-id><pub-id pub-id-type="medline">37837264</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kraft</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fostiropoulos</surname><given-names>K</given-names> </name><name name-style="western"><surname>G&#x00FC;rtler</surname><given-names>N</given-names> </name><name name-style="western"><surname>Arnoux</surname><given-names>A</given-names> </name><name name-style="western"><surname>Davaris</surname><given-names>N</given-names> </name><name name-style="western"><surname>Arens</surname><given-names>C</given-names> </name></person-group><article-title>Value of narrow band imaging in the early diagnosis of laryngeal cancer</article-title><source>Head Neck</source><year>2016</year><month>01</month><volume>38</volume><issue>1</issue><fpage>15</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.1002/hed.23838</pub-id><pub-id 
pub-id-type="medline">24995546</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>He</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Vocal cord lesions classification based on deep convolutional neural network and transfer learning</article-title><source>Med Phys</source><year>2022</year><month>01</month><volume>49</volume><issue>1</issue><fpage>432</fpage><lpage>442</lpage><pub-id pub-id-type="doi">10.1002/mp.15371</pub-id><pub-id pub-id-type="medline">34813114</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Hulst</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Kroon</surname><given-names>W</given-names> </name><name name-style="western"><surname>van der Linden</surname><given-names>ES</given-names> </name><etal/></person-group><article-title>Grade of dysplasia and malignant transformation in adults with premalignant laryngeal lesions</article-title><source>Head Neck</source><year>2016</year><month>04</month><volume>38 Suppl 1</volume><issue>S1</issue><fpage>E2284</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1002/hed.24185</pub-id><pub-id pub-id-type="medline">26268427</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ni</surname><given-names>XG</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>JQ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>QQ</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>GQ</given-names> </name></person-group><article-title>Diagnosis of vocal cord leukoplakia: the role of a novel narrow band imaging endoscopic classification</article-title><source>Laryngoscope</source><year>2019</year><month>02</month><volume>129</volume><issue>2</issue><fpage>429</fpage><lpage>434</lpage><pub-id pub-id-type="doi">10.1002/lary.27346</pub-id><pub-id pub-id-type="medline">30229933</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mannelli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Cecconi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>O</given-names> </name></person-group><article-title>Laryngeal preneoplastic lesions and cancer: challenging diagnosis. 
Qualitative literature review and meta-analysis</article-title><source>Crit Rev Oncol Hematol</source><year>2016</year><month>10</month><volume>106</volume><fpage>64</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1016/j.critrevonc.2016.07.004</pub-id><pub-id pub-id-type="medline">27637353</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>PCI</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name></person-group><article-title>Identifying kidney stone risk factors through patient experiences with a large language model: text analysis and empirical study</article-title><source>J Med Internet Res</source><year>2025</year><month>05</month><day>22</day><volume>27</volume><fpage>e66365</fpage><pub-id pub-id-type="doi">10.2196/66365</pub-id><pub-id pub-id-type="medline">40403294</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Panigrahi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bhoi</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Barsocchi</surname><given-names>P</given-names> </name></person-group><article-title>An improvised deep-learning-based mask R-CNN model for laryngeal cancer detection using CT images</article-title><source>Sensors 
(Basel)</source><year>2022</year><month>11</month><day>15</day><volume>22</volume><issue>22</issue><fpage>8834</fpage><pub-id pub-id-type="doi">10.3390/s22228834</pub-id><pub-id pub-id-type="medline">36433430</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bensoussan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Vanstrum</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Johns</surname><given-names>MM</given-names>  <suffix>III</suffix></name><name name-style="western"><surname>Rameau</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence and laryngeal cancer: from screening to prognosis: a state of the art review</article-title><source>Otolaryngol Head Neck Surg</source><year>2023</year><month>03</month><volume>168</volume><issue>3</issue><fpage>319</fpage><lpage>329</lpage><pub-id pub-id-type="doi">10.1177/01945998221110839</pub-id><pub-id pub-id-type="medline">35787073</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name></person-group><article-title>Diagnosis of ulcerative colitis from endoscopic images based on deep learning</article-title><source>Biomed Signal Process Control</source><year>2022</year><month>03</month><volume>73</volume><fpage>103443</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2021.103443</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name></person-group><article-title>MTANet: multitask-aware network with hierarchical multimodal fusion for RGB-T urban scene understanding</article-title><source>IEEE Trans Intell Veh</source><year>2022</year><month>05</month><volume>8</volume><issue>1</issue><fpage>48</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1109/TIV.2022.3164899</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>KN</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ran</surname><given-names>QY</given-names> </name><etal/></person-group><article-title>DLGNet: a dual-branch lesion-aware network with the supervised gaussian mixture model for colon lesions classification in colonoscopy images</article-title><source>Med Image Anal</source><year>2023</year><month>07</month><volume>87</volume><fpage>102832</fpage><pub-id pub-id-type="doi">10.1016/j.media.2023.102832</pub-id><pub-id pub-id-type="medline">37148864</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name></person-group><article-title>REDFormer: radar enlightens the darkness of camera perception with transformers</article-title><source>IEEE Trans Intell Veh</source><year>2023</year><month>11</month><day>6</day><volume>9</volume><issue>1</issue><fpage>1358</fpage><lpage>1368</lpage><pub-id pub-id-type="doi">10.1109/TIV.2023.3329708</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>W</given-names> </name></person-group><article-title>SAM-FNet: SAM-guided fusion network for laryngo-pharyngeal tumor detection</article-title><conf-name>2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>Dec 3-6, 2024</conf-date><conf-loc>Lisbon, Portugal</conf-loc><pub-id pub-id-type="doi">10.1109/BIBM62325.2024.10822832</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>EYC</given-names> </name><name name-style="western"><surname>Yeung</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YL</given-names> </name><etal/></person-group><article-title>A multimodal imaging-based deep learning model for detecting treatment-requiring retinal vascular diseases: model development and validation 
study</article-title><source>JMIR Med Inform</source><year>2021</year><month>05</month><day>31</day><volume>9</volume><issue>5</issue><fpage>e28868</fpage><pub-id pub-id-type="doi">10.2196/28868</pub-id><pub-id pub-id-type="medline">34057419</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yoshimura</surname><given-names>H</given-names> </name><name name-style="western"><surname>Okubo</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Feasibility of multimodal artificial intelligence using GPT-4 vision for the classification of middle ear disease: qualitative study and validation</article-title><source>JMIR AI</source><year>2024</year><month>05</month><day>31</day><volume>3</volume><issue>1</issue><fpage>e58342</fpage><pub-id pub-id-type="doi">10.2196/58342</pub-id><pub-id pub-id-type="medline">38875669</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Naseer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hayat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zamir</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>FS</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>M</given-names> </name></person-group><article-title>Transformers in vision: a survey</article-title><source>ACM Comput Surv</source><year>2022</year><month>01</month><day>31</day><volume>54</volume><issue>10s</issue><fpage>1</fpage><lpage>41</lpage><pub-id 
pub-id-type="doi">10.1145/3505244</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kadian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Dahle</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>X</given-names> </name></person-group><article-title>Efficient and effective text encoding for Chinese LLaMA and Alpaca</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 17, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.08177</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Savarese</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Hoi</surname><given-names>S</given-names> </name></person-group><article-title>BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 23, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2301.12597</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>C</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name></person-group><article-title>Personalizing text-to-image generation with visual prompts using BLIP-2</article-title><access-date>2025-09-17</access-date><conf-name>Eleventh International Conference on Learning Representations (ICLR 2023)</conf-name><conf-date>May 1-5, 2023</conf-date><conf-loc>Kigali, Rwanda</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=7wxCMgdj5F">https://openreview.net/pdf?id=7wxCMgdj5F</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gadre</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Ilharco</surname><given-names>G</given-names> </name><name name-style="western"><surname>Oh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>L</given-names> </name></person-group><article-title>Improving multimodal datasets with image captioning</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2307.10350</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Haydarov</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Elhoseiny</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT asks, BLIP-2 answers: automatic questioning towards enriched visual descriptions</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 12, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.06594</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>W</given-names> </name><name name-style="western"><surname>Son</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>I</given-names> </name></person-group><article-title>ViLT: vision-and-language transformer without convolution or region supervision</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 5, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2102.03334</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name 
name-style="western"><surname>Hallacy</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ramesh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goh</surname><given-names>G</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Learning transferable visual models from natural language supervision</article-title><access-date>2025-09-17</access-date><conf-name>38th International Conference on Machine Learning (ICML 2021)</conf-name><conf-date>Jul 18-24, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v139/radford21a/radford21a.pdf">https://proceedings.mlr.press/v139/radford21a/radford21a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jia</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Scaling up visual and vision-language representation learning with noisy text supervision</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 11, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2102.05918</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Selvaraju</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Cogswell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vedantam</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Batra</surname><given-names>D</given-names> </name></person-group><article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title><conf-name>International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 22-29, 2017</conf-date><conf-loc>Venice, Italy</conf-loc><pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id></nlm-citation></ref></ref-list></back></article>