<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e68618</article-id><article-id pub-id-type="doi">10.2196/68618</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Automated Radiology Report Labeling in Chest X-Ray Pathologies: Development and Evaluation of a Large Language Model Framework</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Abdullah</surname><given-names>Abdullah</given-names></name><degrees>MCSE</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kim</surname><given-names>Seong Tae</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science and Engineering, Kyung Hee University</institution><addr-line>1732 Deogyeong-daero, Giheung-gu</addr-line><addr-line>Yongin</addr-line><addr-line>Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gorrepati</surname><given-names>Leela Prasad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chowdhury</surname><given-names>Shaika</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yogeshappa</surname><given-names>Vedamurthy Gejjegondanahalli</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Khan</surname><given-names>Yelman</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Seong Tae Kim, PhD, Department of Computer Science and Engineering, Kyung Hee University, 1732 Deogyeong-daero, Giheung-gu, Yongin, Gyeonggi-do, 17104, Republic of Korea, 82 312013761; <email>st.kim@khu.ac.kr</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>28</day><month>3</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e68618</elocation-id><history><date date-type="received"><day>11</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>17</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Abdullah Abdullah, Seong Tae Kim. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 28.3.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e68618"/><abstract><sec><title>Background</title><p>Labeling unstructured radiology reports is crucial for creating structured datasets that facilitate downstream tasks, such as training large-scale medical imaging models. Current approaches typically rely on Bidirectional Encoder Representations from Transformers (BERT)-based methods or manual expert annotations, which have limitations in terms of scalability and performance.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the effectiveness of a generative pretrained transformer (GPT)-based large language model (LLM) in labeling radiology reports, comparing it with 2 existing methods, CheXbert and CheXpert, on a large chest X-ray dataset (MIMIC Chest X-ray [MIMIC-CXR]).</p></sec><sec sec-type="methods"><title>Methods</title><p>In this study, we introduce an LLM-based approach fine-tuned on expert-labeled radiology reports. Our model&#x2019;s performance was evaluated on 687 radiologist-labeled chest X-ray reports, comparing <italic>F</italic>1 scores across 14 thoracic pathologies. 
The performance of our LLM model was compared with the CheXbert and CheXpert models across positive, negative, and uncertainty extraction tasks. Paired <italic>t</italic> tests and Wilcoxon signed-rank tests were performed to evaluate the statistical significance of differences between model performances.</p></sec><sec sec-type="results"><title>Results</title><p>The GPT-based LLM model achieved an average <italic>F</italic>1 score of 0.9014 across all certainty levels, outperforming CheXpert (0.8864) and approaching CheXbert&#x2019;s performance (0.9047). For positive and negative certainty levels, our model scored 0.8708, surpassing CheXpert (0.8525) and closely matching CheXbert (0.8733). Statistically, paired <italic>t</italic> tests indicated no significant difference between our model and CheXbert (<italic>P</italic>=.35) but a significant improvement over CheXpert (<italic>P</italic>=.01). Wilcoxon signed-rank tests corroborated these findings, showing no significant difference between our model and CheXbert (<italic>P</italic>=.14) but confirming a significant difference with CheXpert (<italic>P</italic>=.005). The LLM also demonstrated superior performance for pathologies with longer and more complex descriptions, leveraging its extended context length.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The GPT-based LLM model demonstrates competitive performance compared with CheXbert and outperforms CheXpert in radiology report labeling. These findings suggest that LLMs are a promising alternative to traditional BERT-based architectures for this task, offering enhanced context understanding and eliminating the need for extensive feature engineering. 
Furthermore, with large context length, LLM-based models are better suited for this task as compared with the small context length of BERT-based models.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>generative pre-trained transformers</kwd><kwd>radiology report</kwd><kwd>labeling</kwd><kwd>BERT</kwd><kwd>thoracic pathologies</kwd><kwd>LLM</kwd><kwd>GPT</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Radiology reports consist of expert observations of the radiologist based on the Chest X-ray images of the patient. These reports consist of free-text and unstructured information in the form of long paragraphs. The extraction of labels from unstructured radiology reports is the task of radiology report labeling, and it provides us with structured information which can be used for many downstream tasks such as medical report generation and natural language explanation generation. It also enables training of large-scale medical imaging models [<xref ref-type="bibr" rid="ref1">1</xref>]. Previous works for labeling of radiology reports involve use of complicated feature engineering of medical domain knowledge [<xref ref-type="bibr" rid="ref2">2</xref>] and Bidirectional Encoder Representations from Transformers (BERT) based approaches [<xref ref-type="bibr" rid="ref3">3</xref>]. Transformers have also demonstrated success in radiology report labeling [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, all these methods have limitations which hinder their adoption in the clinical setting. In the case of methods which use complex feature engineering, these methods have leveraged manual annotation to shift the burden from feature engineering, requiring considerable time and expertise. Furthermore, these methods do not take advantage of existing feature-engineered labelers, which are state-of-the-art on many medical tasks. 
On the other hand, for methods using BERT-based models, the models are limited by the inherent limitations of BERT models such as their noncausal nature and limited context length. BERT-based models, despite their effectiveness in text classification tasks, have two key architectural limitations that constrain their performance in radiology report labeling. First, BERT&#x2019;s bidirectional nature focuses on context aggregation but lacks the ability to model causal relationships in sequential data. This noncausal nature can hinder its ability to fully capture the hierarchical and temporally dependent structure of radiology reports, where findings are often sequentially described. Second, BERT&#x2019;s limited input context length (typically 512 tokens) prevents it from effectively processing the long and detailed narratives commonly found in radiology reports. As a result, crucial information in extended texts may be truncated, leading to incomplete or suboptimal labeling. These limitations reduce the adaptability of BERT-based methods to real-world radiology settings, where comprehensive understanding of the entire report is often required.</p><p>Large language models (LLMs) such as Qwen address these challenges by offering extended context lengths (several thousand tokens), allowing the model to process full radiology reports without truncation. In addition, their ability to incorporate causal reasoning and handle instruction-based tasks makes them particularly suitable for medical labeling tasks, where nuanced and ambiguous language is prevalent. Although BERT-based methods have shown increased abilities in classification and other natural language tasks, their architecture poses a hindrance to their use.</p><p>Smit et al [<xref ref-type="bibr" rid="ref3">3</xref>] introduced a combination of existing radiology report labelers and expert annotations to achieve highly accurate automated radiology report labeling. 
Their approach consists of a bio-medically pretrained BERT model [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], which is trained on the outputs of an existing labeler. They call their resulting model CheXbert. While CheXbert has generated considerable results, it has been unable to capture the full diversity, complexity and the ambiguous nature of natural language in the radiology reports. Their BERT-based solution, while providing remarkable performance on the task of labeling, is limited in its context length and noncausal nature, which means that for longer radiology reports it fails to provide a solution.</p><p>In our work, we propose an LLM-based radiology report labeler. LLMs have proven to be successful in their natural language generation capabilities. These models also have longer context lengths which makes them highly suitable for natural language generation tasks. Furthermore, these LLMs are adept at following instructions and given proper instructions these LLMs can be made good labelers for the radiology reports. Our LLM-based model inherently provides ease of use in other LLM-based solutions for the medical domain enabling clinical automation.</p><p>Our generative pretrained transformer (GPT)-based LLM beats the BERT-based CheXbert model on many pathologies and with a far bigger context length can handle long reports as compared with CheXbert. Our model outperforms the previous labelers [<xref ref-type="bibr" rid="ref8">8</xref>] for many pathologies on an external dataset, MIMIC-CXR [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Our method of training medical report labelers opens room for other labels and longer textual input which makes it broadly useful for natural language processing tasks within the medical domain.</p></sec><sec id="s1-2"><title>Related Work</title><p>Many natural language processing systems have been developed to extract structured labels from unstructured free-text radiology reports [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. Mostly, these methods rely heavily on feature engineering and include strict vocabulary and grammatical rules to find and classify radiological reports. NegEx [<xref ref-type="bibr" rid="ref17">17</xref>], a popular rule-based method, uses simple regular expressions for detecting negation of findings and is often used in combination with the Unified Medical Language System [<xref ref-type="bibr" rid="ref18">18</xref>]. NegBio [<xref ref-type="bibr" rid="ref19">19</xref>], an extension to NegEx, uses universal dependencies and subgraph matching for pattern definition and graph traversal search. It includes uncertainty detection in addition to negation detection for multiple pathologies in radiology reports, and is used to label the ChestX-Ray14 dataset [<xref ref-type="bibr" rid="ref1">1</xref>]. The CheXpert labeler [<xref ref-type="bibr" rid="ref8">8</xref>] improves upon NegBio on chest x-ray report classification by more controlled extraction and an improved Natural Language Processing framework and rules for uncertainty and negation extraction. The CheXpert labeler has been applied to generate labels for the CheXpert dataset and MIMIC-CXR [<xref ref-type="bibr" rid="ref9">9</xref>], which are among the largest chest x-ray datasets publicly available. 
We use the MIMIC-CXR dataset to train our LLM-based framework and report our findings on a subset of the test set of MIMIC-CXR which has been labeled by expert radiologists.</p><p>Previous approaches have also been trained using radiology reports annotated by expert radiologists [<xref ref-type="bibr" rid="ref20">20</xref>]. In these approaches, training data is limited by radiologist time and expertise. Chen et al [<xref ref-type="bibr" rid="ref21">21</xref>] trained convolutional neural networks with Global Vectors for Word Representation [<xref ref-type="bibr" rid="ref22">22</xref>] on 1000 radiologist-labeled reports for classification of pulmonary embolism in chest computed tomography scan reports and improved upon the previous rule-based peFinder [<xref ref-type="bibr" rid="ref23">23</xref>]. Another study [<xref ref-type="bibr" rid="ref24">24</xref>] trained both recurrent and convolutional networks in combination with attention mechanisms on 27,593 expert-annotated radiology reports. Transformer-based models have also been applied to the task of radiology report labeling; one study [<xref ref-type="bibr" rid="ref4">4</xref>] trained BERT [<xref ref-type="bibr" rid="ref6">6</xref>] and XLNet-based [<xref ref-type="bibr" rid="ref25">25</xref>] classifiers on 3856 radiologist-labeled reports to detect normal and abnormal labels. Wood et al [<xref ref-type="bibr" rid="ref5">5</xref>] proposed ALARM, an MRI head report classifier on head MRI data using BioBERT model [<xref ref-type="bibr" rid="ref26">26</xref>] trained on 1500 radiologist-labeled reports. They demonstrate improvement over previous fixed embedding and word2vec-based [<xref ref-type="bibr" rid="ref27">27</xref>] models [<xref ref-type="bibr" rid="ref28">28</xref>]. The CheXbert labeler [<xref ref-type="bibr" rid="ref3">3</xref>] is also a BERT-based model which is trained on expert-annotated radiology reports and achieves state-of-the-art results for radiology report labeling. 
However, their method has limitations which include restriction to the context length of 512 which is the limitation of BERT based models.</p><p>Recent advancements in the application of LLMs across various domains, including medical informatics, have demonstrated their versatility and efficacy. Models such as GPT-3 and GPT-4 have been used for diverse tasks, including automated clinical note generation, question answering in health care, and medical coding, showcasing their ability to handle complex and domain-specific language tasks. In radiology, LLMs have been explored for summarizing imaging findings, generating patient-friendly explanations, and aiding in clinical decision-making, highlighting their potential beyond classification tasks. Furthermore, instruction-tuned LLMs, such as ChatGPT and specialized variants like BioGPT, have been shown to adapt effectively to biomedical domains, opening avenues for tasks such as multimodal data interpretation and real-time clinical assistance. These advancements emphasize the need for further exploration of LLMs&#x2019; contextual understanding and adaptability, particularly in radiology report labeling and other biomedical text processing tasks.</p><p>In our work we propose an LLM based solution to solve the task of biomedical text labeling. We not only propose an alternative to BERT based models which achieves better scores on certain labels and has far bigger context length than BERT based models but our approach can also be applied to other biomedical text labeling.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Task</title><p>Radiology report labeling is a critical task involving the extraction of information on the presence or absence of specific thoracic pathologies, such as consolidation or edema, from free-text radiology reports. 
This process enables the transformation of unstructured diagnostic text into structured data, facilitating clinical decision-making, research and the development of predictive models. In this task, a labeler processes the free-text radiology report as input and assigns 1 of 4 classes, blank, positive, negative, or uncertain, to each of 14 predefined observations, reflecting the certainty level for each prediction. A &#x201C;positive&#x201D; label indicates the presence of a pathology, while &#x201C;negative&#x201D; denotes its absence, and &#x201C;uncertain&#x201D; is used when the report is ambiguous about the condition. The &#x201C;blank&#x201D; class is assigned when no relevant information is available for an observation. By converting radiology reports into labeled data, this approach supports streamlined access to essential diagnostic insights and provides a valuable structured dataset that can be used to train and validate machine learning algorithms in medical imaging.</p></sec><sec id="s2-2"><title>Data</title><p>In radiology, there exist 2 large datasets of chest x-rays, CheXpert [<xref ref-type="bibr" rid="ref8">8</xref>] (consisting of 224,316 images), and MIMIC-CXR [<xref ref-type="bibr" rid="ref9">9</xref>] (consisting of 377,110 images). Both datasets have corresponding radiology reports that have been labeled for the same set of 14 observations using the CheXpert labeler [<xref ref-type="bibr" rid="ref8">8</xref>] from the Impression section, or other parts of the radiology report. Furthermore, a subset of both datasets also contain manual annotations by expert radiologists. On CheXpert, a total of 1000 reports (CheXpert manual set) were reviewed by 2 board certified radiologists with disagreement resolution through consensus. On MIMIC-CXR, a total of 687 reports (MIMIC-CXR test set) were reviewed by 2 board certified radiologists and manually labeled for the same 14 medical pathology labels as in CheXpert. 
However, the radiology reports for the Chexpert dataset have not been made public. Due to nonavailability of radiology reports for the Chexpert dataset, we used the MIMIC-CXR test set for evaluation.</p></sec><sec id="s2-3"><title>Large Language Models</title><p>LLMs are built upon stacked decoder layers from the transformer architecture, often referred to as &#x201C;auto-regressive models&#x201D; because of their causal structure. This auto-regressive nature enables these models to predict each token sequentially, relying only on the preceding tokens as context. During training, LLMs learn the task of next-token prediction, where they must accurately anticipate the subsequent token in a sequence given the previous tokens as input. This prediction task is essentially a binary classification problem, where the model assesses whether its predicted token matches the correct ground truth token. The accuracy of each prediction is evaluated by calculating a cross-entropy loss, which measures the difference between the model&#x2019;s output and the correct token. This loss is then back-propagated through the model to update its weights, refining its ability to generate contextually accurate and coherent responses over time.</p><p>LLMs are pretrained on extensive amounts of diverse text data from vast resources available on the internet. This extensive pretraining enables the models to capture complex patterns and detailed knowledge present in language, making them highly effective at understanding and generating natural language. Their exceptional performance in natural language generation, coupled with their ability to handle longer text inputs, motivates their application in various specialized domains, such as radiology report labeling.</p><p>In our work, we specifically use the Qwen model [<xref ref-type="bibr" rid="ref29">29</xref>], particularly the Qwen1.5&#x2010;0.5B variant. 
This model demonstrates significant capabilities in various natural language generation tasks relative to its moderate size, making it both efficient and powerful. One of the Qwen model&#x2019;s key advantages is its extensive context length of 32,000 tokens, meaning it can handle considerably larger text inputs as context compared with BERT-based models, which are typically limited to a 512-token context. This increased context capacity allows the Qwen model to process lengthy radiology reports or extended medical dialogues without truncating important information.</p><p>The enhanced context length in LLMs is particularly beneficial in clinical settings, where maintaining continuity in patient information, such as previous history or ongoing conversations, is essential. As LLM-based frameworks are increasingly adopted in health care, often in the form of biomedical chatbots and other automated systems, the capacity to retain extensive context is critical. Our model addresses this need, providing a solution that integrates seamlessly with existing LLM-driven tools in clinical environments. This ensures that radiology report labeling and other clinical tasks benefit from both accuracy and the ability to preserve a comprehensive, context-aware understanding of patient data.</p></sec><sec id="s2-4"><title>Instruction Fine-Tuning</title><p>To fully use the capabilities of our pretrained LLM, we fine-tune it using a specialized instruction dataset. This dataset is constructed from radiology reports paired with corresponding pathology labels, each with an associated certainty level from the MIMIC-CXR dataset. The goal of this instruction tuning is to guide the pretrained LLM to understand the relationships within radiology reports, enabling it to accurately identify pathologies and assign certainty levels. 
By providing targeted instructions, we aim to refine the LLM&#x2019;s ability to interpret the clinical language and nuanced patterns within radiology reports, thereby enhancing its performance in radiology report labeling.</p><p>The structure of the instruction dataset is designed to facilitate clear guidance for the LLM. As illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>, each data instance consists of an instruction in the form of a prompt to the LLM, an input value (the radiology report), and output values (the pathology labels along with their certainty levels). This structured approach helps the LLM understand both the format and task requirements, allowing it to generate accurate labels from free-text radiology reports.</p><p>To prepare the data for model processing, each instance in the instruction dataset is tokenized using the LLM&#x2019;s tokenizer, converting text into a sequence of tokens suitable for input. The tokenized data is then fed into the LLM, which generates predictions based on its learned representations. Following this, we calculate the cross-entropy loss, which measures the difference between the model&#x2019;s predicted outputs and the ground truth labels. This loss value indicates how closely the model&#x2019;s predictions align with the actual labels. By back-propagating the loss, we adjust the LLM&#x2019;s weights which gradually enhances its accuracy and reliability in producing pathology labels with certainty levels.</p><p>Through this fine-tuning process, our LLM becomes adept at associating the textual features of radiology reports with relevant pathology labels, allowing it to accurately and efficiently label clinical data. 
This approach ensures that the model is optimized specifically for radiology report labeling, enabling it to perform well even on complex clinical information.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Instruction fine-tuning process for a large language model to label radiology reports. The dataset includes prompts structured to guide the large language model, input radiology reports, and corresponding pathology labels annotated with certainty levels. This method was applied to the MIMIC Chest X-ray dataset, a large-scale de-identified chest x-ray dataset containing radiologist-labeled reports. LLM: large language model</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68618_fig01.png"/></fig></sec><sec id="s2-5"><title>Training Details</title><p>We trained our LLM, which has a total of 463,987,712 parameters, using an instruction dataset. During the training process, we maintained a batch size of 2, which allows for effective gradient estimation while minimizing memory usage. This choice of batch size is particularly beneficial given the substantial size of our model and the complexity of the task. The training was conducted over 5 epochs, providing sufficient iterations for the model to learn the intricate relationships between the radiology reports and their corresponding pathology labels.</p><p>To optimize the training process, we used the ADAM optimizer [<xref ref-type="bibr" rid="ref30">30</xref>] which facilitated faster convergence and improved performance. In addition, we implemented a gradient accumulation strategy with a value of 2. 
This approach effectively simulates a larger batch size by accumulating gradients over two iterations before performing a weight update, allowing us to maximize the use of available GPU memory while still benefiting from the stability of a larger batch size.</p><p>The learning rate was set at 1 &#x00D7; 10<sup>&#x2212;4</sup>, a value that strikes a balance between training speed and stability. An appropriate learning rate is crucial in preventing the model from oscillating around the optimal solution, ensuring gradual and consistent improvements in performance. For the instruction tuning of our LLM, we used the SFTTrainer from the TRL library [<xref ref-type="bibr" rid="ref31">31</xref>]. This specialized trainer is designed for supervised fine-tuning, providing an efficient and effective framework for adapting pretrained models to specific tasks. It offers a range of features that streamline the training process, including automated handling of training loops, logging, and monitoring of performance metrics. We leveraged 4 NVIDIA RTX A6000 GPUs. This powerful hardware setup enables parallel processing, significantly reducing training time while accommodating the memory requirements of our large model.</p></sec><sec id="s2-6"><title>Ablation Studies</title><sec id="s2-6-1"><title>Medical Contrastive Language-Image Pretraining Based Similar Reports Retrieval</title><p>To evaluate the effectiveness of our LLM-based solution in its longer context, we augment the LLM with similar retrieved reports which have been retrieved based on the cosine similarity. Medical Contrastive Language-Image Pretraining (MedCLIP) [<xref ref-type="bibr" rid="ref32">32</xref>] is a Contrastive Language-Image Pretraining (CLIP)-based [<xref ref-type="bibr" rid="ref33">33</xref>] model which is trained on medical images. It is trained in a contrastive manner and, based on the cosine similarity between radiology images and texts, it places them closer or farther in its projection space. 
This helps in tasks such as image-image text-text or image-text and text-image based retrieval. To augment the input radiology report to the LLM, CheXbert, and CheXpert alike, we retrieve similar radiology reports from a datastore similar to [<xref ref-type="bibr" rid="ref34">34</xref>]. MedCLIP has a limitation of 77 tokens as its context length and therefore text-text based retrieval is not possible. To overcome this, we use the radiology image associated with the radiology report of 687 radiologist annotated reports from MIMIC-CXR dataset and retrieve similar radiology images from the train set of MIMIC-CXR, this avoids any data leakage. Then the radiology reports corresponding to these top-k similar retrieved images is taken as the retrieved similar reports which are augmented to the test reports to increase the length of radiological text and test the abilities of labeling methods when longer context input is given. Retrieval is based on MedCLIP representations of input images and reports in the datastore, the datastore is precomputed offline and indexed with FAISS [<xref ref-type="bibr" rid="ref35">35</xref>] for efficient nearest neighbor searching.</p><p>However, the performance of LLM dropped because the retrieved reports are not representative of the accurate information which can be used. Even though our model was able to process the additional information provided, it was unable to perform better because of the quality of those retrieved reports. 
In future with the advent of chat-bot style LLM solutions, longer bio-medical text in the form of patient-doctor conversation will be available and can be provided to our model as conversation history and this is where our model can be helpful.</p></sec></sec><sec id="s2-7"><title>Ethical Considerations</title><p>To fully use the capabilities of our pretrained LLM, we fine-tune it using a specialized instruction dataset.</p><sec id="s2-7-1"><title>Human Subject Research Ethics Review</title><p>This study did not involve direct human subject research. Instead, it used the MIMIC-CXR dataset, a publicly available, large-scale radiology dataset. The creation and distribution of the dataset adhere to ethical standards, with data anonymized to protect patient privacy. Therefore, specific ethics review and approval for this secondary analysis were not required.</p></sec><sec id="s2-7-2"><title>Informed Consent</title><p>The MIMIC-CXR dataset is derived from hospital records collected during routine clinical care. Informed consent for the use of this data was waived by the Institutional Review Board (IRB) of the Beth Israel Deaconess Medical Center (BIDMC), as the dataset underwent rigorous deidentification processes to ensure compliance with Health Insurance Portability and Accountability Act standards.</p></sec><sec id="s2-7-3"><title>Privacy and Confidentiality</title><p>The dataset has been thoroughly deidentified to ensure anonymity. This includes removing all protected health information from radiology reports, applying optical character recognition and masking techniques to redact protected health information from images, and assigning random identifiers to patients, studies, and images. 
This guarantees that patient confidentiality is maintained in all analyses.</p></sec><sec id="s2-7-4"><title>Compensation</title><p>No compensation was involved in the collection or use of the MIMIC-CXR dataset, as it is a retrospective collection of clinical records for research purposes.</p></sec><sec id="s2-7-5"><title>Identification of Participants</title><p>There is no risk of identification of individual participants, as all data in the MIMIC-CXR dataset is fully anonymized. The deidentification process for radiology reports, images, and associated metadata adheres to strict privacy protocols to ensure that no identifying information is present in the dataset or the results of this study</p></sec></sec><sec id="s2-8"><title>Evaluation</title><p>In this study, we evaluate the performance of our model, CheXbert and the CheXpert labeler, across a suite of retrieval tasks designed to assess their capabilities in clinical information extraction. Specifically, we focus on 3 main retrieval tasks, namely positive extraction, negative extraction, and uncertainty extraction. For each task, we designate the relevant class as the &#x201C;positive&#x201D; class for classification purposes, meaning, for example, that the &#x201C;negative&#x201D; class is treated as the positive class in the negative extraction task, while other classes (such as positive or uncertain) are treated as negatives. This approach allows us to directly measure the model&#x2019;s ability to distinguish between the specified class and all others, thereby assessing its precision and recall within clinically relevant categories.</p><p>We compute the weighted average of the <italic>F</italic>1 scores for each of the 14 clinical findings or observations present in the CheXpert dataset across these tasks. <italic>F</italic>1 score is a harmonic mean of precision and recall which provides a balanced measure of a model&#x2019;s classification performance, particularly in settings where data is imbalanced. 
We calculate a weighted average of the <italic>F</italic>1 scores for each of the 14 observations across these tasks. By this we can mitigate the impact of class imbalance and obtain an <italic>F</italic>1 metric that accurately reflects the model&#x2019;s performance across both common and rare observations in the dataset.</p><p>This weighted metric, referred to as weighted-<italic>F</italic>1, is denoted simply as <italic>F</italic>1 in our results. Finally, we calculate and report the average <italic>F</italic>1 score across all 14 clinical observations, offering insight into the model&#x2019;s general extraction capability. This average <italic>F</italic>1 score serves as a key performance metric, allowing for a direct comparison between CheXbert and the CheXpert labeler on clinical information extraction tasks relevant to medical imaging applications in radiology.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="table" rid="table2">Table 2</xref> present a quantitative assessment of our model&#x2019;s performance relative to previous BERT-based approaches for the task of radiology report labeling. <xref ref-type="table" rid="table1">Table 1</xref> reports the <italic>F</italic>1 scores obtained by our model on the MIMIC-CXR dataset, specifically on a radiologist-labeled test set comprising 687 radiology reports. These scores encompass all certainty levels, positive, negative, and uncertain, across each of the 14 pathology categories. 
The results indicate that our model demonstrates superior performance over previous methods, particularly in the Enlarged Cardiomediastinum and Support Devices categories, while maintaining competitive F1 scores across the other pathology labels.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p><italic>F</italic><sub>1</sub> scores for predictions made by the proposed large language model-based model (ours) compared with CheXbert and CheXpert for all certainty levels (positive, negative, and uncertain). The evaluation was conducted on the MIMIC-CXR test set of 687 radiologist-labeled chest X-ray reports, covering 14 thoracic pathologies.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Pathologies</td><td align="left" valign="bottom">Ours, <italic>F</italic><sub>1</sub> score</td><td align="left" valign="bottom">CheXbert, <italic>F</italic><sub>1</sub> score</td><td align="left" valign="bottom">CheXpert, <italic>F</italic><sub>1</sub> score</td></tr></thead><tbody><tr><td align="left" valign="top">Enlarged cardiomediastinum</td><td align="left" valign="top">0.9022</td><td align="left" valign="top">0.8753</td><td align="left" valign="top">0.8644</td></tr><tr><td align="left" valign="top">Cardiomegaly</td><td align="left" valign="top">0.8555</td><td align="left" valign="top">0.8604</td><td align="left" valign="top">0.8143</td></tr><tr><td align="left" valign="top">Lung Opacity</td><td align="left" valign="top">0.8653</td><td align="left" valign="top">0.8820</td><td align="left" valign="top">0.8459</td></tr><tr><td align="left" valign="top">Lung lesion</td><td align="left" valign="top">0.9612</td><td align="left" valign="top">0.9627</td><td align="left" valign="top">0.9543</td></tr><tr><td align="left" valign="top">Edema</td><td align="left" valign="top">0.9105</td><td align="left" valign="top">0.9191</td><td align="left" valign="top">0.9064</td></tr><tr><td align="left" 
valign="top">Consolidation</td><td align="left" valign="top">0.9288</td><td align="left" valign="top">0.9385</td><td align="left" valign="top">0.9215</td></tr><tr><td align="left" valign="top">Pneumonia</td><td align="left" valign="top">0.8784</td><td align="left" valign="top">0.8853</td><td align="left" valign="top">0.8474</td></tr><tr><td align="left" valign="top">Atelectasis</td><td align="left" valign="top">0.8613</td><td align="left" valign="top">0.8656</td><td align="left" valign="top">0.8576</td></tr><tr><td align="left" valign="top">Pneumothorax</td><td align="left" valign="top">0.9619</td><td align="left" valign="top">0.9780</td><td align="left" valign="top">0.9572</td></tr><tr><td align="left" valign="top">Pleural effusion</td><td align="left" valign="top">0.8510</td><td align="left" valign="top">0.8649</td><td align="left" valign="top">0.8475</td></tr><tr><td align="left" valign="top">Pleural other</td><td align="left" valign="top">0.9627</td><td align="left" valign="top">0.9623</td><td align="left" valign="top">0.9629</td></tr><tr><td align="left" valign="top">Fracture</td><td align="left" valign="top">0.9734</td><td align="left" valign="top">0.9758</td><td align="left" valign="top">0.9702</td></tr><tr><td align="left" valign="top">Support devices</td><td align="left" valign="top"><italic>0.8607</italic></td><td align="left" valign="top">0.8402</td><td align="left" valign="top">0.8043</td></tr><tr><td align="left" valign="top">No finding</td><td align="left" valign="top">0.8470</td><td align="left" valign="top">0.8557</td><td align="left" valign="top">0.8557</td></tr><tr><td align="left" valign="top">Average</td><td align="left" valign="top">0.9014</td><td align="left" valign="top">0.9047</td><td align="left" valign="top">0.8864</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p><italic>F</italic><sub>1</sub> scores for predictions made by the proposed large language model-based model (ours) 
compared with CheXbert and CheXpert for positive and negative certainty levels only (uncertain cases excluded). The evaluation was conducted on the MIMIC-CXR test set of 687 radiologist-labeled chest X-ray reports, covering 14 thoracic pathologies.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Pathologies</td><td align="left" valign="bottom">Ours, <italic>F</italic><sub>1</sub> score</td><td align="left" valign="bottom">CheXbert, <italic>F</italic><sub>1</sub> score</td><td align="left" valign="bottom">CheXpert, <italic>F</italic><sub>1</sub> score</td></tr></thead><tbody><tr><td align="left" valign="top">Enlarged cardiomediastinum</td><td align="left" valign="top">0.8907</td><td align="left" valign="top">0.8641</td><td align="left" valign="top">0.8447</td></tr><tr><td align="left" valign="top">Cardiomegaly</td><td align="left" valign="top">0.8233</td><td align="left" valign="top">0.8113</td><td align="left" valign="top">0.7650</td></tr><tr><td align="left" valign="top">Lung Opacity</td><td align="left" valign="top">0.8008</td><td align="left" valign="top">0.8230</td><td align="left" valign="top">0.7770</td></tr><tr><td align="left" valign="top">Lung lesion</td><td align="left" valign="top">0.9490</td><td align="left" valign="top">0.9507</td><td align="left" valign="top">0.9415</td></tr><tr><td align="left" valign="top">Edema</td><td align="left" valign="top">0.8850</td><td align="left" valign="top">0.8934</td><td align="left" valign="top">0.8805</td></tr><tr><td align="left" valign="top">Consolidation</td><td align="left" valign="top">0.9104</td><td align="left" valign="top">0.9222</td><td align="left" valign="top">0.9101</td></tr><tr><td align="left" valign="top">Pneumonia</td><td align="left" valign="top">0.8886</td><td align="left" valign="top">0.8907</td><td align="left" valign="top">0.8568</td></tr><tr><td align="left" valign="top">Atelectasis</td><td align="left" valign="top">0.8256</td><td align="left" 
valign="top">0.8316</td><td align="left" valign="top">0.8206</td></tr><tr><td align="left" valign="top">Pneumothorax</td><td align="left" valign="top">0.9505</td><td align="left" valign="top">0.9700</td><td align="left" valign="top">0.9470</td></tr><tr><td align="left" valign="top">Pleural effusion</td><td align="left" valign="top">0.7983</td><td align="left" valign="top">0.8143</td><td align="left" valign="top">0.7965</td></tr><tr><td align="left" valign="top">Pleural other</td><td align="left" valign="top">0.9448</td><td align="left" valign="top">0.9457</td><td align="left" valign="top">0.9466</td></tr><tr><td align="left" valign="top">Fracture</td><td align="left" valign="top">0.9626</td><td align="left" valign="top">0.9649</td><td align="left" valign="top">0.9581</td></tr><tr><td align="left" valign="top">Support devices</td><td align="left" valign="top">0.7911</td><td align="left" valign="top">0.7603</td><td align="left" valign="top">0.7069</td></tr><tr><td align="left" valign="top">No finding</td><td align="left" valign="top">0.7704</td><td align="left" valign="top">0.7835</td><td align="left" valign="top">0.7836</td></tr><tr><td align="left" valign="top">Average</td><td align="left" valign="top">0.8708</td><td align="left" valign="top">0.8733</td><td align="left" valign="top">0.8525</td></tr></tbody></table></table-wrap><p><xref ref-type="table" rid="table2">Table 2</xref> further delineates our model&#x2019;s performance by isolating only positive and negative certainty levels for each of the 14 pathology labels, thereby excluding uncertain cases to facilitate direct comparison. 
In this subset, our model yields notable improvements in <italic>F</italic>1 scores for Enlarged cardiomediastinum, cardiomegaly, and support devices.</p><p>The <italic>P</italic> values presented in <xref ref-type="table" rid="table3">Table 3</xref> provide the results of statistical comparisons between the 3 models (ours, CheXbert, and CheXpert) based on their <italic>F</italic>1 scores. We performed both a paired <italic>t</italic> test and a Wilcoxon test to assess the statistical significance of the differences between the models. The paired <italic>t</italic> test <italic>P</italic> value for the comparison between ours and CheXbert is .35, which indicates that there is no statistically significant difference between the 2 models (<italic>P</italic>&#x003E;.05). However, the comparison between ours and CheXpert yields a paired <italic>t</italic> test <italic>P</italic> value of .01, suggesting that ours performs significantly better than CheXpert at the 5% significance level (<italic>P</italic>&#x003C;.05). Similarly, the Wilcoxon test confirms this finding with a <italic>P</italic> value of .005. The comparison between CheXbert and CheXpert shows highly significant differences with both a paired <italic>t</italic> test <italic>P</italic> value of .0005 and a Wilcoxon test <italic>P</italic> value of .002, indicating that these 2 models also differ significantly in performance. Overall, the statistical tests reveal that while ours and CheXbert are statistically similar, ours outperforms CheXpert, and there is a significant performance difference between CheXbert and CheXpert.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p><italic>P</italic> values from statistical tests (paired <italic>t</italic> test and Wilcoxon test) comparing the performance of the proposed LLM-based model (ours) with CheXbert and CheXpert across 14 thoracic pathologies. 
The evaluation was conducted on the MIMIC-CXR test set of 687 radiologist-labeled chest X-ray reports, assessing the significance of the differences in <italic>F</italic><sub>1</sub> scores for all certainty levels.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Comparison</td><td align="left" valign="bottom">Paired <italic>t</italic> test <italic>P</italic> value</td><td align="left" valign="bottom">Wilcoxon test <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Ours versus CheXbert</td><td align="left" valign="top">.35</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top">Ours versus CheXpert</td><td align="left" valign="top">.01</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top">CheXbert versus CheXpert</td><td align="left" valign="top">.0005</td><td align="left" valign="top">.002</td></tr></tbody></table></table-wrap><p>Compared with the rule-based CheXpert labeler [<xref ref-type="bibr" rid="ref8">8</xref>], our model achieves substantial performance gains across all pathology labels, marking a significant advancement over both traditional rule-based systems and BERT-based models. These results demonstrate the ability of our large language model (LLM)-based approach to handle the task of radiology report labeling with greater precision with large context length as compared with the small context length of BERT-based models.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> presents the performance of the proposed model (ours) compared with the baseline models (CheXbert and CheXpert) across 14 chest pathologies, using <italic>F</italic>1 scores as the evaluation metric. The grouped bar chart enables a direct comparison, where each group corresponds to a specific pathology, and individual bars represent the scores for each model. 
The results reveal that all 3 models achieve consistently high <italic>F</italic>1 scores across most pathologies, indicating robust performance. While the differences are marginal, notable trends can be observed: the proposed model demonstrates an advantage in pathologies such as support devices and enlarged cardiomediastinum, whereas the baseline models perform slightly better in cases like consolidation and pneumothorax. This chart highlights the subtle strengths and weaknesses of each model, providing a clear visual overview of their performance.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Performance comparison of the proposed large language model-based model (ours), CheXbert, and CheXpert across 14 thoracic pathologies. The <italic>F</italic><sub>1</sub> scores for each pathology label were calculated using the MIMIC-CXR test set, comprising 687 radiologist-labeled reports. While all models show similar performance, the proposed model demonstrates competitive or superior results in cases such as &#x201C;support devices&#x201D; and &#x201C;no finding.&#x201D;</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68618_fig02.png"/></fig><p><xref ref-type="fig" rid="figure3">Figure 3</xref> further explores the differences by visualizing the absolute performance gaps between the proposed model (ours) and the 2 baseline models using a heatmap. Rows correspond to the pathologies, and columns display the comparisons of ours against CheXbert and CheXpert. The heatmap emphasizes that the differences are minor, with most pathologies showing a difference of less than 0.02 in <italic>F</italic>1 scores. However, it also highlights key instances where ours performs notably better, such as in support devices and enlarged cardiomediastinum, while the baselines excel slightly in consolidation and pneumothorax. 
By focusing on performance differences, the heatmap provides a nuanced perspective, making it easy to identify pathologies where the models vary.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Heatmap showing the absolute differences in <italic>F</italic><sub>1</sub> scores between the proposed arge language model-based model (ours) and the baseline models (CheXbert and CheXpert) across 14 thoracic pathologies. The analysis is based on predictions for the MIMIC-CXR test set of 687 radiologist-labeled reports. The heatmap highlights areas where ours excels, such as &#x201C;support devices,&#x201D; and where baseline models perform better, such as &#x201C;consolidation.&#x201D;</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68618_fig03.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Limitations</title><p>Currently, our work is limited to 14 pathology labels prevalent in radiology. However, our method can be extended to other medical fields and can accommodate many more pathology labels and can provide a reliable labeling solution. The lack of long form radiology text limits the demonstration of the longer contextual abilities of our model. In future with the advent of long form bio-medical text in the form of patient-doctor conversations, our model can provide better solutions as opposed to previous models. Furthermore, there is a need of more publicly available radiologist annotated datasets for additional evaluation of trained models. Furthermore, like other LLM-based approaches, our model is susceptible to biases present in the training data, which may inadvertently propagate into its predictions. This is particularly important in the medical domain, where biases can have significant ethical and clinical implications. 
Additional mechanisms for bias detection and mitigation are necessary to ensure fair and equitable outcomes across diverse patient populations.</p><p>The computational demands of LLMs are another limitation, as training and inference require substantial computational resources. This may limit the accessibility and scalability of the approach, especially in resource-constrained clinical settings. Exploring techniques such as model distillation or low-rank adaptation could help reduce these requirements while maintaining performance.</p><p>Interpretability remains a challenge with LLMs due to their complex architecture. The lack of transparency in how decisions are made can hinder trust and adoption in clinical workflows, where clear justification of predictions is often required. Future work should explore ways to improve the explainability of LLM predictions, such as attention visualization or feature attribution methods tailored to medical tasks.</p><p>While our method demonstrates promising results in radiology, further validation is needed in real-world clinical settings and across diverse medical contexts. The scalability of our approach to other modalities, such as MRI or CT reports, and its adaptability to different healthcare systems must also be explored to ensure generalizability.</p></sec><sec id="s4-2"><title>Principal Findings and Conclusions</title><p>In our work, we propose a novel solution using LLMs for the challenging task of radiology report labeling. Our approach leverages the advanced capabilities of LLMs to accurately interpret and label various pathologies within radiology reports, which are crucial for supporting clinical decision-making and improving patient care. We demonstrate that LLMs achieve superior performance over previous BERT-based models, especially in accurately identifying specific pathology labels. 
This is a significant advancement, as certain pathologies that were previously challenging to label accurately can now be identified with greater precision using our LLM-based approach. In addition, our model achieves competitive scores across a broad range of pathology labels, confirming its robustness and versatility in handling complex medical language.</p><p>One of the key improvements introduced by our model is its ability to process longer radiology reports, overcoming the context length limitations often encountered in previous models. This capability is essential in clinical settings where comprehensive radiology reports often contain detailed descriptions spanning multiple paragraphs, which traditional models struggle to handle effectively. By accommodating extended context, our model ensures that no critical information is overlooked, thereby enhancing labeling accuracy and reliability in real-world applications.</p><p>Furthermore, the instruction-tuning methodology allows our approach to be extended to other biomedical text labeling tasks beyond radiology. This generalizability of our method makes it highly applicable across a wide range of biomedical domains, opening doors for enhanced automation in various clinical documentation and reporting tasks.</p><p>As LLM-based solutions are increasingly adopted in clinical settings, our model is designed to integrate seamlessly into existing workflows, promoting efficiency, and facilitating clinical automation. The potential for real-time, accurate labeling provided by our model not only reduces the manual workload on health care professionals but also contributes to more timely and precise diagnostic insights, ultimately benefiting patient outcomes.</p><p>Future research should explore integrating LLMs with other medical technologies, such as multimodal imaging systems that combine radiology reports with visual data from x-rays, CT scans, or magnetic resonance images. 
This integration could enable a more comprehensive analysis and further enhance the clinical utility of LLM-based models. In addition, extending the application of LLMs to different medical imaging modalities and nonradiology domains, such as pathology or cardiology, presents a promising avenue for expanding their impact across health care.</p><p>Developing scalable and efficient LLM solutions tailored to specific clinical needs, including resource-constrained environments, is another critical area for further exploration. This includes investigating lightweight alternatives or model compression techniques to reduce computational demands while maintaining performance.</p><p>Finally, the clinical adoption of LLMs will require continued efforts in improving model interpretability and building trust among health care professionals. Future work should focus on developing user-friendly interfaces and explainability mechanisms to facilitate the seamless integration of LLMs into routine clinical workflows. By addressing these challenges, LLM-based models like ours have the potential to revolutionize clinical practice, enabling more accurate, efficient, and scalable solutions for medical documentation, decision-making, and patient care.</p></sec></sec></body><back><ack><p>This work was supported in part by the National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT; no.RS-2024-00334321) and by the Institute of Information and Communications Technology Planning and Evaluation (IITP) Grant funded by the Korea Government (MSIT) under Grant (IITP-2022-0-00078; Explainable Logical Reasoning for Medical Knowledge Generation), Grant (IITP-2025-RS-2023-00258649; Information Technology Research Center).</p></ack><notes><sec><title>Data Availability</title><p>The dataset used in this study, MIMIC-CXR, is a publicly available dataset containing chest X-ray images and associated radiology reports. 
As the MIMIC-CXR dataset is a publicly available dataset, no restrictions are placed on the access to this dataset. The code used for instruction tuning is not currently available, but the authors are happy to discuss and share details upon request</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">CLIP</term><def><p>Contrastive Language-Image Pretraining</p></def></def-item><def-item><term id="abb3">GPT</term><def><p>generative pretrained transformer</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MedCLIP</term><def><p>Medical Contrastive Language-Image Pretraining</p></def></def-item><def-item><term id="abb6">NRF</term><def><p>National Research Foundation of Korea</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bagheri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Summers</surname><given-names>RM</given-names> </name></person-group><article-title>ChestX-ray8: hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases</article-title><conf-name>Proceedings of the IEEE conference on computer vision and pattern 
recognition</conf-name><conf-date>2017</conf-date><conf-loc>Honolulu, HI</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR.2017.369</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pons</surname><given-names>E</given-names> </name><name name-style="western"><surname>Braun</surname><given-names>LMM</given-names> </name><name name-style="western"><surname>Hunink</surname><given-names>MGM</given-names> </name><name name-style="western"><surname>Kors</surname><given-names>JA</given-names> </name></person-group><article-title>Natural language processing in radiology: a systematic review</article-title><source>Radiology</source><year>2016</year><month>05</month><volume>279</volume><issue>2</issue><fpage>329</fpage><lpage>343</lpage><pub-id pub-id-type="doi">10.1148/radiol.16142770</pub-id><pub-id pub-id-type="medline">27089187</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Smit</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pareek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lungren</surname><given-names>M</given-names> </name></person-group><article-title>Combining automatic labelers and expert annotations for accurate radiology report labeling using BERT</article-title><year>2020</year><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-loc>Online</conf-loc><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.aclweb.org/anthology/2020.emnlp-main">https://www.aclweb.org/anthology/2020.emnlp-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.117</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drozdov</surname><given-names>I</given-names> </name><name name-style="western"><surname>Forbes</surname><given-names>D</given-names> </name><name name-style="western"><surname>Szubert</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carlin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lowe</surname><given-names>DJ</given-names> </name></person-group><article-title>Supervised and unsupervised language modelling in Chest X-Ray radiological reports</article-title><source>PLoS One</source><year>2020</year><volume>15</volume><issue>3</issue><fpage>e0229963</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0229963</pub-id><pub-id pub-id-type="medline">32155219</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wood</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Lynch</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kafiabadi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Automated labelling using an attention model for radiology reports of MRI scans (ALARM)</article-title><conf-name>Medical Imaging with Deep Learning</conf-name><conf-date>2020</conf-date><conf-loc>Montr&#x00E9;al</conf-loc><fpage>811</fpage><lpage>826</lpage><pub-id pub-id-type="doi">10.48550/arXiv.2002.06588</pub-id></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name></person-group><article-title>Bert: pre-training of deep bidirectional transformers for language understanding</article-title><year>2019</year><conf-name>Annual Meeting of the Association for Computational Linguistics</conf-name><conf-loc>Florence</conf-loc><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>Transfer learning in biomedical natural language processing: an evaluation of BERT and elmo on ten benchmarking datasets</article-title><year>2019</year><conf-name>Proceedings of the 18th BioNLP Workshop and Shared Task</conf-name><conf-loc>Florence, Italy</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/W19-50">https://www.aclweb.org/anthology/W19-50</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/W19-5006</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Irvin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>M</given-names> </name><etal/></person-group><article-title>CheXpert: a large chest radiograph dataset with uncertainty labels and expert 
comparison</article-title><source>AAAI</source><year>2019</year><volume>33</volume><issue>1</issue><fpage>590</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.1609/aaai.v33i01.3301590</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Greenbaum</surname><given-names>NR</given-names> </name><etal/></person-group><article-title>MIMIC-CXR-JPG, a large publicly available database of labeled chest radiographs</article-title><source>arXiv</source><comment>Preprint posted online in 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1901.07042</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yadav</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sarioglu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Cartwright</surname><given-names>WB</given-names>  <suffix>4th</suffix></name><name name-style="western"><surname>Hinds</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Chamberlain</surname><given-names>JM</given-names> </name></person-group><article-title>Automated outcome classification of computed tomography imaging reports for pediatric traumatic brain injury</article-title><source>Acad Emerg Med</source><year>2016</year><month>02</month><volume>23</volume><issue>2</issue><fpage>171</fpage><lpage>178</lpage><pub-id pub-id-type="doi">10.1111/acem.12859</pub-id><pub-id pub-id-type="medline">26766600</pub-id></nlm-citation></ref><ref
id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassanpour</surname><given-names>S</given-names> </name><name name-style="western"><surname>Langlotz</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Amrhein</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Befera</surname><given-names>NT</given-names> </name><name name-style="western"><surname>Lungren</surname><given-names>MP</given-names> </name></person-group><article-title>Performance of a machine learning classifier of knee MRI reports in two large academic radiology practices: a tool to estimate diagnostic yield</article-title><source>AJR Am J Roentgenol</source><year>2017</year><month>04</month><volume>208</volume><issue>4</issue><fpage>750</fpage><lpage>753</lpage><pub-id pub-id-type="doi">10.2214/AJR.16.16128</pub-id><pub-id pub-id-type="medline">28140627</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Annarumma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Withey</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Bakewell</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Pesce</surname><given-names>E</given-names> </name><name name-style="western"><surname>Goh</surname><given-names>V</given-names> </name><name name-style="western"><surname>Montana</surname><given-names>G</given-names> </name></person-group><article-title>Automated triaging of adult chest radiographs with deep artificial neural networks</article-title><source>Radiology</source><year>2019</year><month>04</month><volume>291</volume><issue>1</issue><fpage>196</fpage><lpage>202</lpage><pub-id pub-id-type="doi">10.1148/radiol.2018180921</pub-id><pub-id 
pub-id-type="medline">30667333</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Masanz</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ogren</surname><given-names>PV</given-names> </name><etal/></person-group><article-title>Mayo clinical text analysis and knowledge extraction system (cTAKES): architecture, component evaluation and applications</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>507</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id><pub-id pub-id-type="medline">20819853</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Clinical information extraction applications: a literature review</article-title><source>J Biomed Inform</source><year>2018</year><month>01</month><volume>77</volume><fpage>34</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id><pub-id pub-id-type="medline">29162496</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Zafar</surname><given-names>H</given-names> </name><name name-style="western"><surname>Galperin-Aizenberg</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Cook</surname><given-names>T</given-names> </name></person-group><article-title>Integrating natural language processing and machine learning algorithms to categorize oncologic response in radiology reports</article-title><source>J Digit Imaging</source><year>2018</year><month>04</month><volume>31</volume><issue>2</issue><fpage>178</fpage><lpage>184</lpage><pub-id pub-id-type="doi">10.1007/s10278-017-0027-x</pub-id><pub-id pub-id-type="medline">29079959</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bozkurt</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alkim</surname><given-names>E</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rubin</surname><given-names>DL</given-names> </name></person-group><article-title>Automated detection of measurements and their descriptors in radiology reports using a hybrid natural language processing algorithm</article-title><source>J Digit Imaging</source><year>2019</year><month>08</month><volume>32</volume><issue>4</issue><fpage>544</fpage><lpage>553</lpage><pub-id pub-id-type="doi">10.1007/s10278-019-00237-9</pub-id><pub-id pub-id-type="medline">31222557</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Bridewell</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hanbury</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>GF</given-names> </name><name name-style="western"><surname>Buchanan</surname><given-names>BG</given-names> 
</name></person-group><article-title>A simple algorithm for identifying negated findings and diseases in discharge summaries</article-title><source>J Biomed Inform</source><year>2001</year><month>10</month><volume>34</volume><issue>5</issue><fpage>301</fpage><lpage>310</lpage><pub-id pub-id-type="doi">10.1006/jbin.2001.1029</pub-id><pub-id pub-id-type="medline">12123149</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bodenreider</surname><given-names>O</given-names> </name></person-group><article-title>The unified medical language system (UMLS): integrating biomedical terminology</article-title><source>Nucleic Acids Res</source><year>2004</year><month>01</month><day>1</day><volume>32</volume><issue>Database issue</issue><fpage>D267</fpage><lpage>70</lpage><pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id><pub-id pub-id-type="medline">14681409</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bagheri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Summers</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>NegBio: a high-performance tool for negation and uncertainty detection in radiology reports</article-title><year>2018</year><conf-name>AMIA Summits on Translational Science Proceedings</conf-name><pub-id pub-id-type="doi">10.48550/arXiv.1712.05898</pub-id></nlm-citation></ref><ref 
id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xue</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ruan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>He</surname><given-names>P</given-names> </name></person-group><article-title>Fine-tuning BERT for joint entity and relation extraction in chinese medical text</article-title><conf-name>2019 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>2019</conf-date><conf-loc>San Diego, CA, USA</conf-loc><pub-id pub-id-type="doi">10.1109/BIBM47256.2019.8983370</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Ball</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Deep learning to classify radiology free-text reports</article-title><source>Radiology</source><year>2018</year><month>03</month><volume>286</volume><issue>3</issue><fpage>845</fpage><lpage>852</lpage><pub-id pub-id-type="doi">10.1148/radiol.2017171115</pub-id><pub-id pub-id-type="medline">29135365</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pennington</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Socher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Manning</surname><given-names>CD</given-names> </name></person-group><article-title>Glove: global vectors for word representation</article-title><conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>2014</conf-date><conf-loc>Doha, Qatar</conf-loc><pub-id pub-id-type="doi">10.3115/v1/D14-1162</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chapman</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>HP</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name></person-group><article-title>Document-level classification of CT pulmonary angiography reports based on an extension of the ConText algorithm</article-title><source>J Biomed Inform</source><year>2011</year><month>10</month><volume>44</volume><issue>5</issue><fpage>728</fpage><lpage>737</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.03.011</pub-id><pub-id pub-id-type="medline">21459155</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bustos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pertusa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Salinas</surname><given-names>JM</given-names> </name><name name-style="western"><surname>de la Iglesia-Vay&#x00E1;</surname><given-names>M</given-names> </name></person-group><article-title>PadChest: A large chest x-ray image dataset with multi-label annotated 
reports</article-title><source>Med Image Anal</source><year>2020</year><month>12</month><volume>66</volume><fpage>101797</fpage><pub-id pub-id-type="doi">10.1016/j.media.2020.101797</pub-id><pub-id pub-id-type="medline">32877839</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name></person-group><article-title>XLNet: generalized autoregressive pretraining for language understanding</article-title><year>2019</year><conf-name>Proceedings of the 33rd International Conference on Neural Information Processing Systems</conf-name><conf-loc>Vancouver</conf-loc><pub-id pub-id-type="doi">10.48550/arXiv.1906.08237</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><article-title>Distributed representations of words and phrases and their compositionality</article-title><year>2013</year><conf-loc>Lake Tahoe</conf-loc><pub-id pub-id-type="doi">10.48550/arXiv.1310.4546</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zech</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Titano</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Natural language-based machine learning models for the annotation of clinical radiology reports</article-title><source>Radiology</source><year>2018</year><month>05</month><volume>287</volume><issue>2</issue><fpage>570</fpage><lpage>580</lpage><pub-id pub-id-type="doi">10.1148/radiol.2018171093</pub-id><pub-id pub-id-type="medline">29381109</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Qwen technical report</article-title><source>arXiv</source><comment>Preprint posted online on 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.16609</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kingma</surname><given-names>DP</given-names> 
</name></person-group><article-title>Adam: A method for stochastic optimization</article-title><year>2015</year><conf-name>International Conference for Learning Representations</conf-name><conf-loc>San Diego</conf-loc><pub-id pub-id-type="doi">10.48550/arXiv.1412.6980</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>von Werra</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belkada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tunstall</surname><given-names>L</given-names> </name><etal/></person-group><article-title>TRL: transformer reinforcement learning</article-title><year>2020</year><access-date>2025-03-19</access-date><publisher-name>GitHub, GitHub repository</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/huggingface/trl">https://github.com/huggingface/trl</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>MedCLIP: contrastive learning from unpaired medical images and text</article-title><conf-name>Proc Conf Empir Methods Nat Lang Process</conf-name><conf-date>2022</conf-date><fpage>3876</fpage><lpage>3887</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.256</pub-id><pub-id pub-id-type="medline">39144675</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wook Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hallacy</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Learning transferable visual models from natural language supervision</article-title><conf-name>International conference on machine learning</conf-name><conf-date>2021</conf-date><pub-id pub-id-type="doi">10.48550/arXiv.2103.00020</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ramos</surname><given-names>R</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>B</given-names> </name><name name-style="western"><surname>Elliott</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kementchedjhieva</surname><given-names>Y</given-names> </name></person-group><article-title>Smallcap: lightweight image captioning prompted with retrieval augmentation</article-title><conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name><conf-date>2023</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00278</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Douze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jegou</surname><given-names>H</given-names> </name></person-group><article-title>Billion-scale similarity search with GPUs</article-title><source>IEEE Trans Big 
Data</source><year>2021</year><month>07</month><volume>7</volume><issue>3</issue><fpage>535</fpage><lpage>547</lpage><pub-id pub-id-type="doi">10.1109/TBDATA.2019.2921572</pub-id></nlm-citation></ref></ref-list></back></article>