<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e81644</article-id><article-id pub-id-type="doi">10.2196/81644</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Understanding Transformer-Based Classifications of Medical Text Using a Large Language Model for the Attribution of Feature Importance: Proof-of-Concept Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Fangwen</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Saha</surname><given-names>Ashirbani</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Afzal</surname><given-names>Muhammad</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Parrish</surname><given-names>Rick</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haynes</surname><given-names>R Brian</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Iorio</surname><given-names>Alfonso</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lokker</surname><given-names>Cynthia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Health Information Research Unit, Department of Health Research Methods, Evidence, and Impact, Faculty of Health Sciences, McMaster University</institution><addr-line>1280 Main Street West</addr-line><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Department of Oncology, Faculty of Health Sciences, McMaster University</institution><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Department of Computer Science, Birmingham City University</institution><addr-line>Birmingham</addr-line><country>United Kingdom</country></aff><aff id="aff4"><institution>Department of Medicine, Faculty of Health Sciences, McMaster University</institution><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jobayer</surname><given-names>Md</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Cynthia Lokker, PhD, Health Information Research Unit, Department of Health Research Methods, Evidence, and Impact, Faculty of Health Sciences, McMaster University, 1280 Main Street West, Hamilton, ON, L8S 4L8, Canada, 1 905-525-9140 ext 22208; <email>lokkerc@mcmaster.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>10</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e81644</elocation-id><history><date date-type="received"><day>31</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>29</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Fangwen Zhou, Ashirbani Saha, Muhammad Afzal, Rick Parrish, R Brian Haynes, Alfonso Iorio, Cynthia Lokker. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 10.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e81644"/><abstract><sec><title>Background</title><p>Deep learning, particularly encoder-only transformer architectures, has demonstrated excellent performance in biomedical literature classification, facilitating evidence-based medicine and knowledge synthesis. However, the opacity of these models&#x2019; decision-making processes limits their clinical interpretability, trustworthiness, and widespread adoption. Traditional explainable artificial intelligence methods, such as Shapley Additive Explanations (SHAP) and integrated gradients (IG), address this issue but often incur substantial computational overhead for text classification. Generative large language models may offer a novel approach to generating interpretable, context-aware explanations as autonomous agents.</p></sec><sec><title>Objective</title><p>As a proof-of-concept, the study aimed to investigate the effectiveness of GPT-4o as a standalone, end-to-end perturbation-based explainer for a BioLinkBERT text classifier. We compared its explanations against the SHAP partition explainer and IG as established baselines in terms of explanation faithfulness and semantic alignment.</p></sec><sec sec-type="methods"><title>Methods</title><p>A stratified sample of 200 studies from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases was classified by a fine-tuned BioLinkBERT model for methodological rigor. The sampling specifically over-represented difficult, low-confidence predictions to rigorously test the explainers, with an equal number of studies sampled from each probability decile predicted by BioLinkBERT. 
GPT-4o, SHAP, and IG generated token-level feature attributions across a robust feature space of 80,901 tokens. GPT-based explanations were derived through a sophisticated, iterative masking perturbation workflow under 2 prompting schemes (token indices vs explicit subword tokens). Explanations were evaluated using a rank-based, modified area over the perturbation curve (AOPC), pairwise correlation analyses, and qualitative assessment of feature importance.</p></sec><sec sec-type="results"><title>Results</title><p>Among the 200 studies, 80,901 tokens were included, and feature attributions were generated by the 4 explainers (6369 unique tokens). SHAP (AOPC 0.222, 95% CI 0.200-0.244) and IG (AOPC 0.225, 95% CI 0.202-0.247) provided consistent explanations, effectively identifying tokens relevant to study rigor (eg, &#x201C;randomized&#x201D; and &#x201C;blind&#x201D;). In contrast, despite evaluating a larger perturbation space, the GPT-4o prompting schemes did not achieve comparable faithfulness (AOPC 0.025-0.029) and produced divergent token attributions. Correlation analysis demonstrated moderate alignment between SHAP and IG (Pearson <italic>r</italic>=0.367), whereas GPT-4o exhibited limited correlation (Pearson <italic>r</italic>&#x2264;0.032) with the established baselines. Sensitivity analyses isolating only correctly classified instances yielded similar trends. Additionally, the iterative application programming interface calls required for GPT made it significantly more computationally intensive and costly to execute, whereas IG was the most temporally efficient.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Despite their advanced contextual capabilities, current generative large language models are limited when deployed as standalone perturbation explainers. 
The findings reveal that GPT-4o struggles to accurately synthesize mathematical feature importance through iterative masking, lacking the reliability of traditional explainable artificial intelligence frameworks. Future research could build upon this work and investigate specialized prompt engineering, whole-word recombination strategies, and hybrid frameworks.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>explainable artificial intelligence</kwd><kwd>feature attribution</kwd><kwd>integrated gradients</kwd><kwd>Shapley Additive Explanations</kwd><kwd>SHAP</kwd><kwd>GPT</kwd><kwd>deep learning</kwd><kwd>natural language processing</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The rapid growth of biomedical literature has driven the development of automated classification systems to facilitate knowledge synthesis and translation [<xref ref-type="bibr" rid="ref1">1</xref>]. Deep learning, particularly encoder-only transformer architectures such as Bidirectional Encoder Representations from Transformers (BERT), has gained significant attention in biomedical text classification [<xref ref-type="bibr" rid="ref2">2</xref>]. These models excel due to their ability to capture contextual information, leverage transfer learning, and minimize the need for extensive data preprocessing and feature engineering, making them highly effective for biomedical applications [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>However, the complex, multilayered nature of BERT models undermines their interpretability, posing challenges in understanding their decision-making processes [<xref ref-type="bibr" rid="ref5">5</xref>]. Explainable artificial intelligence (XAI) techniques aim to address this limitation by providing insights into feature importance [<xref ref-type="bibr" rid="ref6">6</xref>]. 
One widely used XAI framework is Shapley Additive Explanations (SHAP), which is grounded in game theory and uses Shapley values to systematically estimate feature contributions by perturbing inputs [<xref ref-type="bibr" rid="ref7">7</xref>]. Despite its theoretical robustness, SHAP has substantial computational overhead. It requires summing marginal contributions across feature subsets, which leads to an exponential increase in complexity as the feature space grows [<xref ref-type="bibr" rid="ref8">8</xref>]. Consequently, computing SHAP values becomes impractical for BERT models that process long sequences of up to 512 tokens.</p><p>To mitigate this challenge, a partition explainer groups features into structured partitions, which reduces complexity while preserving interactions. By approximating Shapley values using Owen values [<xref ref-type="bibr" rid="ref9">9</xref>], the partition explainer enhances scalability, making it particularly suitable for high-dimensional text classification tasks. Another widely used method is integrated gradients (IG) based on the Aumann-Shapley method, which ensures axiomatic fairness and path-integrated attribution of feature importance [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. It offers a computationally efficient approach to estimating feature importance by measuring the accumulated gradients along the path between the baseline input and the instance input. IG has been widely applied in natural language processing (NLP) tasks, providing a balance between interpretability and computational feasibility [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. 
However, these methods face challenges in explaining text classifiers due to significant multicollinearity between input tokens and high-dimensional feature spaces [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>More recently, pretrained generative large language models (LLMs) leveraging transformer decoders have garnered wide attention in NLP due to their performance and flexibility [<xref ref-type="bibr" rid="ref17">17</xref>]. Previous studies, such as those by Zytek et al [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>] and Zeng and Zhu [<xref ref-type="bibr" rid="ref20">20</xref>], explored LLMs in model explanation, investigating the use of LLMs to convert SHAP explanations into plain-text descriptions to improve human interpretability. Unlike perturbation-based XAI methods or gradient-based XAI methods, LLMs can generate explanations while incorporating token-level contextual relationships, potentially leading to more faithful feature attributions. More recently, LLMs have started to support structured JSON output and function calling, providing a convenient way to integrate model predictions [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Despite these advances, no prior studies have explored the usage of LLMs as standalone explainers for deep learning models in biomedical text classification. To address this gap, as a proof-of-concept, we develop and validate a methodology to investigate GPT-4o by OpenAI, as an end-to-end, agentic perturbation explainer for a BERT-based biomedical text classifier. 
We compare its performance against SHAP&#x2019;s partition explainer and IG explanations.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Classifier and Dataset Description</title><p>This study builds upon the work from a previous study [<xref ref-type="bibr" rid="ref24">24</xref>], where we fine-tuned 630 encoder-only transformer models using grid search. The data came from the McMaster Premium Literature Service (PLUS) and the Clinical Hedges databases associated with the McMaster Health Information Research Unit. Detailed descriptions of these 2 databases are published elsewhere [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. In short, both databases include treatment, primary prevention, and/or quality improvement studies that had been manually appraised using custom criteria for randomized controlled trials [<xref ref-type="bibr" rid="ref29">29</xref>] to determine whether they were methodologically rigorous or nonrigorous. Studies in the PLUS database from inception (2003-2023; n=53,219) were used for training (n=42,575), validation (n=5322), and testing (n=5322). Studies from 2024 in McMaster PLUS (n=1011) and the Clinical Hedges (n=6572) were used for external testing. The top-performing models were identified on the validation set and subsequently tested.</p><p>For this study, we selected a stratified random sample of 200 studies, 40 from each dataset. For each of the 5 data subsets, studies were placed into 10 bins based on their predicted probability for rigor, and a random sample of 4 studies per probability bin per dataset was selected. The probability scores were generated by the model that had the lowest validation loss, which was a BioLinkBERT-based model with a learning rate of 3 &#x00D7; 10<sup>&#x2212;5</sup>, a batch size of 64, a random seed of 2, and included class weight adjustments. 
The model was fine-tuned for 5 epochs before premature termination by early stopping, and weights from epoch 2 were used as it achieved the lowest validation loss. Other relevant configurations can be found in our previous publication [<xref ref-type="bibr" rid="ref26">26</xref>]. The model achieved a cross-entropy loss of 0.291, an area under the receiver-operating characteristic curve of 0.941, and an accuracy of 0.879 on the full validation set.</p></sec><sec id="s2-2"><title>SHAP Explanations</title><p>We used the SHAP partition explainer [<xref ref-type="bibr" rid="ref30">30</xref>] to compute an Owen value for each token in each prediction. The partition explainer was chosen due to its efficiency in high-dimensional text classification and its ability to capture feature interactions more effectively than standard Shapley value approximations [<xref ref-type="bibr" rid="ref31">31</xref>]. SHAP values were calculated using logits that were back-transformed from softmax probabilities.</p></sec><sec id="s2-3"><title>IG Explanations</title><p>We used IG to estimate token-level feature attributions for each prediction. We used an empty sequence padded to 512 tokens with &#x201C;[PAD]&#x201D; as the baseline input, ensuring the absence of semantic content while preserving the tokenization structure. The baseline input produces a predicted probability of rigor of 11.7%. Attributions were derived by computing gradients with respect to the input embeddings across 30 interpolation steps. The total IG attribution per token was calculated by aggregating gradients across all embedding dimensions.</p></sec><sec id="s2-4"><title>GPT Explanations</title><sec id="s2-4-1"><title>Overview</title><p>We used GPT-4o-2024-11-20 with a temperature of 0, and both presence and frequency penalties set to 0, to ensure deterministic outputs. The objective was to evaluate GPT&#x2019;s ability to estimate token-level feature attributions through perturbation-based explanations, similar to SHAP. 
A total of 2 prompting schemes, GPT-index and GPT-token, were designed to systematically mask tokens and assess their influence on classifier predictions. Tokens were obtained by processing the original input through BioLinkBERT&#x2019;s word-piece tokenizer. Both schemes received the number of input tokens, predicted logits for both classes, and the probability of the positive class. Additionally, GPT-token was provided with the complete list of input tokens in a comma-separated format and the manual appraisal criteria. The full prompts used for both schemes are available in Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. A flow diagram can be found in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the pipeline for generating explanations and feature attributions from GPT-index and GPT-token. The pipeline illustrates the iterative perturbation-based workflow, including the developer prompt, the initial user prompt, repeated masking iterations, and batched feature attribution generation using structured outputs. &#x2020;Input tokens are included only in the initial user prompt for GPT-token. &#x2021;The information and instructions provided in the developer prompt differ for GPT-index and GPT-token, as GPT-index was not provided with the input tokens. 
Detailed prompts can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81644_fig01.png"/></fig></sec><sec id="s2-4-2"><title>Developer Prompt</title><p>In the developer prompt, GPT was provided with (1) the role of a machine learning model explainer, (2) the task of explaining a binary encoder-only transformer text classifier&#x2019;s prediction via perturbations by masking input tokens, (3) the scheme-specific information that would be provided in the user prompts, and (4) step-by-step instructions on defining importance, masking, function calling, and generating importance values that would be executed subsequently. The manual appraisal criteria [<xref ref-type="bibr" rid="ref31">31</xref>] for GPT-token were included in the developer prompt.</p></sec><sec id="s2-4-3"><title>Initial User Prompt</title><p>In the initial user prompt, both prompting schemes (GPT-index and GPT-token) were provided with the number of tokens, the predicted logits of the positive and negative classes, and the probability of the positive class. The input tokens, in the format of a comma-separated list, were provided to GPT-token only in the initial user prompt.</p></sec><sec id="s2-4-4"><title>Subsequent User Prompts</title><p>The model was first instructed to generate the definition of &#x201C;importance&#x201D; for itself and then to call <italic>mask_and_predict</italic> with lists of individual indices (eg, [[0], [1], &#x2026;, [x-1]], for an input with x tokens), echoing the instructions provided in the developer prompt. To call <italic>mask_and_predict</italic>, we used the function-calling feature [<xref ref-type="bibr" rid="ref32">32</xref>] in OpenAI&#x2019;s application programming interface (API). 
The function, in general, takes lists of integers as input and returns the logits for both classes and the probability of the positive class for each list of indices to mask, with every token at the index replaced with &#x201C;[MASK].&#x201D;</p><p>Subsequently, the model was prompted 10 times to generate 10 to 30 lists with any number of indices to mask and call <italic>mask_and_predict</italic>, where each iteration included the results of all previous iterations. The model was explicitly instructed to avoid generating the same combinations of indices and to adapt future masking based on prior iteration results. Finally, the model was asked to redefine &#x201C;importance&#x201D; based on the initial definition and the results of all masking iterations.</p></sec><sec id="s2-4-5"><title>Feature Attribution Calculations</title><p>The model was prompted, with the final message chain including the initial user prompt, all iterations of perturbations, and both iterations of importance definition, to generate the feature importance for each token, 20 tokens per batch. Crucially, while the output generation was batched to bypass GPT&#x2019;s limitations in generating long structured sequences, the full global context of all masking permutations and model predictions was retained in the prompt for every batch. The model was not provided with the calculated feature attributions of other batches, as the mathematical calculation based on its own definition only required the global perturbation history, which was always present. This batched approach was taken because the model often had issues with generating longer sequences. 
The structured output function [<xref ref-type="bibr" rid="ref33">33</xref>] of the API was leveraged to generate a list of dictionaries of token indices and their corresponding feature attributions.</p></sec></sec><sec id="s2-5"><title>Evaluation</title><sec id="s2-5-1"><title>Area Over the Perturbation Curve</title><p>To establish feature attribution performance, we used a modified version of the area over the perturbation curve (AOPC), which was used in previous literature [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. The AOPC was calculated for each explanation individually and then averaged across all 200 instances.</p><p>The original AOPC is calculated using the formula in <xref ref-type="disp-formula" rid="E1">Equation 1</xref>.</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>A</mml:mi><mml:mi>O</mml:mi><mml:mi>P</mml:mi><mml:mi>C</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>K</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>P</italic>(<italic>x</italic>) is the predicted probability for the positive class with the original 
input <italic>x</italic>, <italic>x</italic><sup>(</sup><italic><sup>i</sup></italic><sup>)</sup> is the perturbed input with the top <italic>i</italic> important features removed or masked, and <italic>K</italic> is the number of perturbation steps. This formula assumes that features contribute to the positive class; hence, their removal would result in a decrease in the predicted probability, and <italic>P</italic>(<italic>x</italic>)&#x2212;<italic>P</italic>(<italic>x</italic><sup>(</sup><italic><sup>i</sup></italic><sup>)</sup>) would be positive. Crucially, because AOPC relies on iteratively masking the top-k features, it is fundamentally a rank-based metric; it evaluates the explainer&#x2019;s ability to correctly order feature importance rather than its precision in quantifying absolute attribution values.</p><p>For binary text classification, feature attributions could be associated with a negative value, indicating more support for the negative class [<xref ref-type="bibr" rid="ref34">34</xref>]. Under such circumstances, their removal would lead to an increase in the probability of the positive class. 
For this reason, we adapted the AOPC formula in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>.</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>A</mml:mi><mml:mi>O</mml:mi><mml:mi>P</mml:mi><mml:mi>C</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>p</mml:mi></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>x</italic><sup>(</sup><italic><sup>i</sup></italic><sup>)</sup> and <italic>x</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup> are the perturbed inputs with the top <italic>i</italic> positively-attributed features and the top <italic>j</italic> negatively-attributed features masked, respectively. <italic>K<sub>p</sub></italic> and <italic>K<sub>n</sub></italic> are the number of perturbation steps for the positive features and negative features, respectively, which, in this case, would be equal to the number of positively and negatively attributed tokens. Similar to the original metric, a larger value would indicate higher attribution faithfulness. Note that the operands corresponding to the &#x201C;+&#x201D; operation must be computed separately (to enable the removal of positive features and negative features separately) before the final summation is performed.</p></sec><sec id="s2-5-2"><title>Correlation Analysis</title><p>The pairwise correlation between feature attributions for each of the 4 methods (SHAP, IG, GPT-index, and GPT-token) was assessed using Pearson <italic>r</italic>, Spearman <italic>&#x03C1;</italic>, and Kendall <italic>&#x03C4;</italic>. Distribution similarity was measured using the Wasserstein distance. A <italic>P</italic> value of .05 or less is indicative of statistical significance. 
The distributions of feature attributions were visualized using scatter plots.</p></sec><sec id="s2-5-3"><title>Feature Importance Attributions</title><p>The 10 most important features that had an occurrence of &#x2265;1, &#x2265;10, and &#x2265;100 for each explainer were examined using bar graphs.</p></sec></sec><sec id="s2-6"><title>Sensitivity Analysis</title><p>We conducted sensitivity analyses that included only correctly classified instances to explore the impact of classification accuracy on explanation faithfulness.</p></sec><sec id="s2-7"><title>Hardware and Software</title><p>We used the resources from the Cedar cluster of the Digital Research Alliance of Canada. Training, evaluation, and explanation were conducted using 1 NVIDIA V100 Volta (32 GB HBM2 memory), as well as an allocation of 8 cores and 40 GB of memory. Querying of GPT was conducted locally with an AMD 9950x and 64 GB of system memory.</p><p>Visual Studio Code (Microsoft Corp) and Python 3.11.9 (Python Software Foundation) were used for all software development. We used the <italic>transformers</italic> library by Hugging Face to obtain pretrained models, and <italic>torch</italic> was used for evaluation purposes. The <italic>shap</italic> and <italic>captum</italic> libraries were used to calculate feature attributions via partition explainer and IG, respectively. The <italic>openai</italic> library was used to query GPT-4o. Data management and statistical analysis were conducted using <italic>pandas</italic>, <italic>numpy</italic>, and <italic>scikit-learn</italic>. Data visualization was done with <italic>matplotlib</italic> and <italic>seaborn</italic>. 
The full list of libraries used on the Digital Research Alliance of Canada and the local environment can be found in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study exclusively involved the computational analysis of previously published biomedical and clinical literature originating from the McMaster PLUS and Clinical Hedges databases. As the research relied entirely on the secondary analysis of publicly available, published documents and did not involve the collection of data from or interaction with human subjects, it is exempt from institutional ethics review. Consequently, requirements regarding informed consent, human subject privacy and confidentiality protections, and participant compensation are not applicable to this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Characteristics of the Dataset and Classifier</title><p>The original dataset contained 60,802 instances, of which 34,090 (56.1%) were rigorous. After stratified sampling, the 200 instances contained 83 (41.5%) rigorous studies. Within this dataset, the BioLinkBERT model achieved a cross-entropy loss of 0.527, an area under the receiver-operating characteristic curve of 0.812, and an accuracy of 0.705 using the default threshold of a predicted probability of 0.50 or more. The 200 instances contained a total of 80,901 tokens, of which 6369 were unique.</p></sec><sec id="s3-2"><title>Importance Definitions by GPT</title><p>GPT, in both prompting schemes, was instructed to define &#x201C;importance&#x201D; after being provided with the initial user prompt and subsequently redefine &#x201C;importance&#x201D; after all iterations of masking had been completed. 
Both GPT-index and GPT-token initially defined &#x201C;importance&#x201D; as the change in the predicted probability of the positive class before and after masking for all 200 instances.</p><p>After redefinition for GPT-index, the definition remained consistent as the change in predicted probability in 199 (99.5%) instances. Of these, 3 (1.5%), 37 (18.5%), and 16 (8%) instances normalized the change by logits, initial predicted probability, and the number of masked tokens in a perturbation, respectively. The remaining instance used the change in the difference between the positive and negative logits as the definition of importance.</p><p>For GPT-token, the definition for all 200 instances remained consistent as the change in predicted probability. Among these, 67 (33.5%) and 9 (4.5%) instances normalized the change by the initial predicted probability and the number of tokens masked, respectively.</p></sec><sec id="s3-3"><title>AOPC Analysis</title><p>SHAP and IG explanations achieved similar faithfulness, with a mean (95% CI) of 0.222 (0.200-0.244) and 0.225 (0.202-0.247), respectively (<xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref>). SHAP was better at identifying negative tokens, while IG was better at identifying positive tokens. The GPT-index and GPT-token schemes yielded substantially lower AOPC scores of 0.025 (0.012-0.038) and 0.029 (0.014-0.043), respectively. 
Notably, both schemes produced inverted (negative) AOPC values for negative tokens, indicating a divergence in baseline attribution logic.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance of 4 explainers (Shapley Additive Explanations [SHAP], integrated gradients [IG], GPT-index, and GPT-token) based on the mean area over the perturbation curve (AOPC) across 200 stratified studies sampled from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases (2003-2024), classified for methodological rigor using a fine-tuned BioLinkBERT model. Higher AOPC indicates greater attribution faithfulness<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Explainer</td><td align="left" valign="bottom">AOPC (all tokens), mean (95% CI)</td><td align="left" valign="bottom">AOPC (Tokens with positive attributions), mean (95% CI)</td><td align="left" valign="bottom">AOPC (Tokens with negative attributions), mean (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">SHAP</td><td align="left" valign="top">0.222 (0.200 to 0.244)</td><td align="left" valign="top">0.277 (0.249 to 0.306)</td><td align="left" valign="top">0.037 (0.030 to 0.044)</td></tr><tr><td align="left" valign="top">IG</td><td align="left" valign="top">0.225 (0.202 to 0.247)</td><td align="left" valign="top">0.326 (0.293 to 0.359)</td><td align="left" valign="top">0.026 (0.019 to 0.033)</td></tr><tr><td align="left" valign="top">GPT-index</td><td align="left" valign="top">0.025 (0.012 to 0.038)</td><td align="left" valign="top">0.045 (0.028 to 0.063)</td><td align="left" valign="top">&#x2212;0.021 (&#x2212;0.034 to &#x2212;0.008)</td></tr><tr><td align="left" valign="top">GPT-token</td><td align="left" valign="top">0.029 (0.014 to 0.043)</td><td align="left" valign="top">0.049 (0.029 to 0.068)</td><td align="left" 
valign="top">&#x2212;0.021 (&#x2212;0.031 to &#x2212;0.010)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>All values are shown as the mean (95% CI) across the 200 instances.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Perturbation curves of the 4 explainers (Shapley Additive Explanations [SHAP], integrated gradients [IG], GPT-index, and GPT-token) across 200 stratified studies sampled from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases (2003-2024), classified for methodological rigor using a fine-tuned BioLinkBERT model. Shaded areas represent the 95% CI. (A) All tokens, (B) tokens with positive attributions, and (C) tokens with negative attributions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81644_fig02.png"/></fig></sec><sec id="s3-4"><title>Sign Inversion Error Analysis</title><p>To analyze whether the negative AOPC values for negatively attributed features from the GPT explainers were a result of a systematic sign error, we systematically inverted the signs of all feature attributions and recalculated their AOPC values. After inversion, the AOPC for all, positively attributed, and negatively attributed tokens were &#x2212;0.019 (&#x2212;0.032 to &#x2212;0.006), 0.032 (0.019-0.046), and &#x2212;0.046 (&#x2212;0.063 to &#x2212;0.028) for GPT-index, and &#x2212;0.028 (&#x2212;0.043 to &#x2212;0.014), 0.022 (0.011-0.034), and &#x2212;0.050 (&#x2212;0.070 to &#x2212;0.030) for GPT-token.</p></sec><sec id="s3-5"><title>Correlation Analysis</title><p>Feature attributions from SHAP and IG exhibit moderate correlation with each other, with a Pearson <italic>r</italic> of 0.367 (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="fig" rid="figure3">Figure 3</xref>). 
No notable correlation is evident between feature attributions from other pairs of explainers. Wasserstein distances reveal that the distributions of feature attributions are similar across all explainers.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Pairwise correlation and distribution similarity of token-level feature attributions generated by 4 explainers (Shapley Additive Explanations [SHAP] partition explainer, integrated gradients [IG], GPT-index, and GPT-token) across 80,901 tokens from 200 stratified studies sampled from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases (2003-2024), classified for methodological rigor using a fine-tuned BioLinkBERT model. Pearson <italic>r</italic>, Spearman <italic>&#x03C1;</italic>, and Kendall <italic>&#x03C4;</italic> assess linear and rank-based correlation, while the Wasserstein distance measures distributional similarity between attribution value distributions.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Explainer A</td><td align="left" valign="bottom">Explainer B</td><td align="left" valign="bottom">Pearson <italic>r</italic></td><td align="left" valign="bottom">Spearman <italic>&#x03C1;</italic></td><td align="left" valign="bottom">Kendall <italic>&#x03C4;</italic></td><td align="left" valign="bottom">Wasserstein distance</td></tr></thead><tbody><tr><td align="left" valign="top">SHAP</td><td align="left" valign="top">IG</td><td align="left" valign="top">0.367<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.275<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.192<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.002</td></tr><tr><td align="left" valign="top">SHAP</td><td align="left" valign="top">GPT-index</td><td align="left" 
valign="top">&#x2212;0.031<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.061<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.041<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.003</td></tr><tr><td align="left" valign="top">SHAP</td><td align="left" valign="top">GPT-token</td><td align="left" valign="top">0.004</td><td align="left" valign="top">0.037<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.025<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.003</td></tr><tr><td align="left" valign="top">IG</td><td align="left" valign="top">GPT-index</td><td align="left" valign="top">0.003</td><td align="left" valign="top">0.038<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.026<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.004</td></tr><tr><td align="left" valign="top">IG</td><td align="left" valign="top">GPT-token</td><td align="left" valign="top">0.032<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.029<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.020<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.005</td></tr><tr><td align="left" valign="top">GPT-index</td><td align="left" valign="top">GPT-token</td><td align="left" valign="top">0.083<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.096<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.071<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.001</td></tr></tbody></table><table-wrap-foot><fn 
id="table2fn1"><p><sup>a</sup>Statistical significance (<italic>P</italic>&#x003C;.05).</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Scatter plots of token-level feature attributions generated by 4 explainers (Shapley Additive Explanations [SHAP] partition explainer, integrated gradients [IG], GPT-index, and GPT-token) across 80,901 tokens from 200 stratified studies sampled from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases (2003-2024), classified for methodological rigor using a fine-tuned BioLinkBERT model. (A) SHAP and IG, (B) SHAP and GPT-index, (C) SHAP and GPT-token, (D) IG and GPT-index, (E) IG and GPT-token, (F) GPT-index and GPT-token.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81644_fig03.png"/></fig></sec><sec id="s3-6"><title>Feature Importance Attributions</title><p>Of the 80,901 generated feature attributions, 6369, 1073, and 87 were from unique tokens that had occurrences of &#x2265;1, &#x2265;10, and &#x2265;100, respectively. The most important unique tokens with &#x2265;10 occurrences can be found in <xref ref-type="fig" rid="figure4">Figure 4</xref>. Those with occurrences of &#x2265;1 and &#x2265;100 can be found in Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Among those with &#x2265;10 and &#x2265;100 occurrences, both SHAP and IG identified tokens that were associated with study designs, including &#x201C;cohort,&#x201D; &#x201C;pilot,&#x201D; &#x201C;exploratory,&#x201D; &#x201C;randomly,&#x201D; and &#x201C;blind,&#x201D; among others. In contrast, the GPT explainers did not exhibit a cohesive semantic pattern among tokens with &#x2265;10 occurrences. 
While GPT-token successfully identified select key clinical terms (eg, &#x201C;trial&#x201D; and &#x201C;randomized&#x201D;), it was unable to systematically isolate negatively contributing tokens.</p><p>Important tokens with 1 or more occurrences for SHAP and IG primarily consisted of terms related to study design, year, or topic. There is no consistent pattern between the 2 GPT explainers.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Accumulated local feature attributions of the identified most important negative and positive tokens with 10 or more occurrences, generated by 4 explainers (Shapley Additive Explanations [SHAP] partition explainer, integrated gradients [IG], GPT-index, and GPT-token) across 80,901 tokens from 200 stratified studies sampled from the McMaster Premium Literature Service (PLUS) and Clinical Hedges databases (2003-2024). The values are presented as mean (95% CI). (A) Negative tokens for SHAP, (B) positive tokens for SHAP, (C) negative tokens for IG, (D) positive tokens for IG, (E) negative tokens for GPT-index, (F) positive tokens for GPT-index, (G) negative tokens for GPT-token, and (H) positive tokens for GPT-token.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81644_fig04.png"/></fig></sec><sec id="s3-7"><title>Sensitivity Analysis</title><p>We conducted a sensitivity analysis restricted to the 141 correctly classified instances (70.5% of the 200 studies). Of the 57,195 tokens, 5341, 816, and 57 were unique with an occurrence of &#x2265;1, &#x2265;10, and &#x2265;100, respectively. 
There was no notable change in the faithfulness of the explainers based on AOPC (Table S5 and Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), pairwise correlation of feature attributions among the explainers (Table S6 and Figure S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and the most important tokens identified by accumulated feature attributions (Figures S5-S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>To our knowledge, this is the first experiment that attempts to leverage decoder transformers to establish feature attributions for text classifiers through perturbation. While our results do not indicate that GPT could be a potential substitute for conventional explanation methods in this context, this study nevertheless serves as a valuable exploratory analysis that could inspire future research in this area.</p><sec id="s4-1"><title>Principal Findings</title><p>While AOPC does not establish absolute faithfulness, it is a common method to compare the relative performance of explainers on the same model [<xref ref-type="bibr" rid="ref36">36</xref>]. Our results demonstrate that the SHAP partition explainer and IG were similar in their overall performance. SHAP was better at identifying negative tokens, while IG was better at identifying positive tokens. Our results also demonstrated that GPT was able to generate reasonable definitions of importance when provided with the task of generating feature attributions as an explainer. While delegating the definition of feature importance to the model itself theoretically risks inconsistency and could weaken methodological rigor, our analysis showed stability, with the model consistently defining importance as the change in predicted probability. 
This confirms that the poor performance of the GPT explainers is not a byproduct of an unstable metric definition. In spite of this, the GPT explainers struggled to generate reliable feature attributions. In particular, the negative AOPC for negative tokens indicates that the GPT explainers mistakenly associated negative attributions with features that increased the probability of rigor. A plausible explanation for this is a sign inversion error, wherein the model reports the raw negative delta of a masked positive feature rather than its intended importance magnitude. However, our generative prompts explicitly enforced a strict sign convention requiring positive floats for positive classifications and negative floats for negative classifications. An examination of the most important tokens reveals that the top negative features identified by GPT do not symmetrically align with the positive features identified by SHAP or IG. After sign inversion, AOPC decreased from 0.025 to &#x2212;0.019 for GPT-index, and from 0.029 to &#x2212;0.028 for GPT-token. Therefore, this discrepancy likely represents a limitation in GPT&#x2019;s semantic feature attribution logic rather than a sign inversion or delta reporting error. These findings were echoed by the correlation analyses, where attributions by SHAP and IG had a moderate correlation with each other, while the 2 GPT explainers had weak or no correlation with the others. Also, sensitivity analyses isolating only the correctly classified instances yielded similar trends, confirming that GPT&#x2019;s poor attribution performance is an inherent limitation of its logical reasoning rather than an artifact of attempting to explain confused or incorrect model predictions.</p><p>GPT&#x2019;s limited faithfulness is unlikely to be attributable to an undersampled perturbation space. The initial masking generated the same number of perturbations as the number of tokens. 
Subsequently, GPT proceeded with 10 iterations of masking, generating between 10 and 30 masking combinations per iteration, evaluating an additional 100 to 300 unique perturbations. In contrast, the SHAP partition explainer successfully established faithful baseline attributions using only 86 perturbations for a 512-token sequence. The fact that GPT evaluated a significantly larger subset of the perturbation space yet failed to produce aligned attributions indicates an inherent limitation in the LLM&#x2019;s ability to logically synthesize mathematical perturbation results, rather than a lack of search space exploration. Furthermore, it is important to emphasize that our results are established with a stratified sample of 200 instances and a specific prompting strategy. Therefore, our findings should be framed as a specific evaluation of this methodology on this dataset, rather than a definitive ruling on GPT&#x2019;s overall use for all biomedical text explanations.</p><p>While methods to examine the global attributions for transformer models are an area of active research [<xref ref-type="bibr" rid="ref37">37</xref>], we were able to examine the accumulated local attributions across all 200 instances. SHAP and IG indicate that the BioLinkBERT model generally aligned with the manual appraisal criteria [<xref ref-type="bibr" rid="ref38">38</xref>], with terms such as &#x201C;cohort,&#x201D; &#x201C;pilot,&#x201D; &#x201C;randomized,&#x201D; and &#x201C;blind,&#x201D; among others, being identified as the most important. The tokens identified by GPT did not align with SHAP or IG and seemed to be semantically nonsensical in the context of rigor classification. 
For instance, both GPT-index and GPT-token identified &#x201C;pilot&#x201D; as a positive contributor, contrary to manual appraisal as well as SHAP and IG explanations.</p></sec><sec id="s4-2"><title>Prompting</title><p>A challenge of this experiment was the development of prompts for GPT, considering the complex nature of generating feature attributions from perturbations. It is known that sophisticated prompting techniques can improve GPT&#x2019;s performance in NLP [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. In our study, we used numerous established techniques in prompt engineering in an attempt to improve performance, including role prompting, decomposition by providing instructions step by step, as well as chain-of-thought to a certain degree, with multiple iterations of perturbations and the redefinition of importance [<xref ref-type="bibr" rid="ref40">40</xref>]. GPT was also limited in responding with long, quantitative sequences despite explicit instructions and structured output restrictions [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. We mitigated this concern by explicitly instructing GPT to respond with a certain number of lists as parameters to the <italic>mask_and_predict</italic> function, using structured outputs and function calling, and decomposing the calculation steps to 20 tokens per batch. Despite this, GPT was not able to generate faithful attributions. Furthermore, we hypothesized that an advantage of LLMs would be the ability to recognize likely important tokens before any quantitative explanations have been generated, considering their ability to understand and encode contextualized information from plain text [<xref ref-type="bibr" rid="ref43">43</xref>]. Therefore, we experimented with 2 prompting schemes, namely GPT-index and GPT-token. 
However, our results show that there was no meaningful difference regardless of the inclusion of input tokens in the initial user prompt.</p></sec><sec id="s4-3"><title>Resource Requirements</title><p>A challenge with traditional XAI methods is the significant computational resources required. As previously mentioned, exhaustively calculating SHAP values from all possible perturbations is infeasible, resulting in the rise of numerous methods to approximate SHAP values [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>], including the partition explainer [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. The computational requirement for IG is associated with integration steps. While more steps result in higher precision, we found 30 steps to be feasible on GPUs with 32 GB of memory and more time-efficient than the SHAP partition explainer.</p><p>High computational costs and time delays were incurred due to the iterative approach with the GPT explainers. Similar to SHAP, the BioLinkBERT model must be queried to obtain predictions for the perturbed instances. Additionally, each subsequent prompt in the chain results in higher inference and response times due to network latency and the autoregressive nature of LLM text generation. Consequently, GPT was unequivocally the slowest method to generate explanations, while also incurring a direct financial cost from OpenAI&#x2019;s servers of approximately US $1.00 per instance.</p></sec><sec id="s4-4"><title>Deployment and Research Implications</title><p>Explainability and interpretability in biomedical and clinical machine learning are key areas of research [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. 
As a pioneer in evidence-based medicine and knowledge translation, the McMaster Health Information Research Unit aims not only to automate biomedical literature classification and appraisal [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref48">48</xref>] but also to ensure that the process is transparent and reproducible to facilitate trust among clinicians who subscribe to PLUS and PLUS-associated services. Based on the results of this experiment, we believe that both SHAP and IG would be suitable for deployment alongside a top-performing model. More recently, studies [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>] and systematic review support systems [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>] have begun to leverage supervised or active learning extensively to support knowledge translation and synthesis by relevance ranking or automatic classification. We believe that systems should attempt to integrate XAI frameworks alongside any black-box models for better transparency.</p><p>While we did not obtain promising results in using GPT as an end-to-end approach for feature attributions, our work nevertheless serves as a foundation for future research. Given the sensitivity of GPT-based explanations to prompt design, future studies could explore more sophisticated, domain-tailored prompting strategies and iterative prompt refinement using techniques, such as few-shot learning, to better align GPT&#x2019;s output with domain-specific interpretability criteria [<xref ref-type="bibr" rid="ref40">40</xref>]. Fine-tuning LLM explainers on biomedical corpora could also improve their understanding of specialized terminology and context [<xref ref-type="bibr" rid="ref54">54</xref>]. 
Hybrid explanation frameworks, such as leveraging LLMs to establish a partition hierarchy [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref30">30</xref>] or integrating model-internal signals, such as attention weights, with LLM-based explanation methods, may also be of interest [<xref ref-type="bibr" rid="ref55">55</xref>-<xref ref-type="bibr" rid="ref57">57</xref>]. Specifically, future proof-of-concept studies should investigate whether grounding LLM-generated contextual explanations in traditional feature attributions, such as SHAP or IG, can produce more faithful and human-interpretable results than standalone generative explainers.</p></sec><sec id="s4-5"><title>Strengths and Limitations</title><p>Our study has several strengths. First, to our knowledge, this is the first experiment that attempts to leverage decoder transformers to establish feature attributions for text classifiers by perturbation. Second, a concern with leveraging LLMs in medical research is reproducibility, as evidence-based medicine is founded upon concepts of transparency, reliability, and the ability to validate findings through rigorous, repeatable methodologies [<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref61">61</xref>]. We mitigated this concern by using a temperature of 0, making the outputs of the LLM deterministic and replicable. Third, we mitigated concerns with the original AOPC metric on binary text classification by separately considering negative and positive features. This allowed us to better capture the faithfulness of the explanations. Fourth, we leveraged sophisticated prompting techniques for GPT. This indicates that the poor results from GPT are likely an inherent limitation of the pretraining and the model architecture rather than the prompt.</p><p>Nevertheless, important limitations must be considered when interpreting our results. 
First, there is no known method to establish ground truth in black-box models, and explaining text models with a high-dimensional feature space remains a challenge [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. Consequently, SHAP and IG were used as established comparative baselines rather than definitive ground truths, and AOPC was used as an objective proxy for faithfulness. For IG specifically, the &#x201C;[PAD]&#x201D; baseline yields a predicted probability of 11.7% for the positive class, compared to the class prevalence of 41.5% in the stratified subset, indicating that the baseline is not prediction-neutral. This means IG attributions reflect token contributions relative to a negatively biased starting point, which may systematically inflate the apparent importance of tokens that are most associated with the positive class. For these reasons, it is important to note that our findings are context-specific, dataset-specific, and model-specific. Second, due to resource constraints, we could only experiment with a stratified subset of 200 instances from the original dataset. While we attempted to minimize sampling bias through stratified sampling, this limited sample size may restrict the generalizability of our findings, particularly regarding the correlation analysis. However, because the unit of analysis for feature attribution is the token (N=80,901), our study retains robust statistical power to evaluate explainer behavior within this sample, as evidenced by our narrow CIs. Nonetheless, a larger dataset would further increase our confidence. Third, word-piece tokenization often separates words into fragments, potentially affecting how feature attributions are assigned [<xref ref-type="bibr" rid="ref63">63</xref>]. 
This mismatch between models may have contributed to the poor performance of the GPT-token scheme, forcing a generative LLM to reason over another model&#x2019;s disjointed, comma-separated word-piece tokens rather than its native text processing. Consequently, the explanations may not correspond to human-interpretable linguistic units, especially for numerical texts. However, the negligible performance difference between GPT-token and GPT-index, which was only provided the number of maskable tokens, indicates that tokenization is likely not the primary contributor to explanation faithfulness. Nevertheless, as a language model, GPT may be limited in accurately mapping tokens from a long, comma-separated list of numerical indices. Therefore, future research should investigate whether recombining these subword tokens into whole words prior to the LLM explanation phase improves semantic alignment and attribution faithfulness. Finally, GPT&#x2019;s performance on a task is heavily prompt-specific. While our methodology used highly sophisticated prompt engineering techniques, our evaluation was strictly zero-shot to test the model&#x2019;s baseline reasoning. The substantial API costs associated with iterative perturbation precluded us from conducting comprehensive ablation studies or providing few-shot examples. Furthermore, both SHAP and IG are of a zero-shot nature, and using zero-shot for GPT allows for a more robust comparison. It remains unknown whether GPT would show promise with a different set of prompts, and this is a critical area for future investigation.</p></sec><sec id="s4-6"><title>Conclusions</title><p>We conducted a comprehensive proof-of-concept exploration into the application of GPT-4o as a standalone, end-to-end perturbation explainer for a BioLinkBERT biomedical text classifier. Our objective was to compare the faithfulness of GPT-driven explanations against established baseline methods, specifically the SHAP partition explainer and IG. 
The results demonstrated that while SHAP and IG provided consistent and relatively faithful feature attributions, the GPT-based approaches, regardless of whether they were prompted with token indices or explicit subword tokens, yielded poor explanations. This was evidenced by near-zero correlation with established methods and counterintuitive token attributions. Consequently, the findings of this study indicate that despite advanced contextual capabilities, current generative LLMs struggle to accurately synthesize mathematical feature importance through iterative masking, lacking the reliability of traditional XAI frameworks for this specific task. Despite these limitations, our work offers valuable insights and establishes a foundation for future research aimed at integrating LLMs into the explainability framework.</p></sec></sec></body><back><ack><p>The authors thank the Digital Research Alliance of Canada for the computational resources support.</p></ack><notes><sec><title>Funding</title><p>FZ was funded through the Mitacs Business Strategy Internship grant (IT42947) with matching funds from EBSCO. The use of GPT-4o was funded by credits through the OpenAI Researcher Access Program (0000014443). The funders were not involved in the conceptualization, conduct, or dissemination of the project.</p></sec><sec><title>Data Availability</title><p>The code is available in a public GitHub repository [<xref ref-type="bibr" rid="ref64">64</xref>]. 
All data and analyses supporting the findings of this study are available from the first or corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: FZ, AS, MA, CL</p><p>Data curation: FZ, RP, CL</p><p>Formal analysis: FZ</p><p>Funding acquisition: FZ, CL</p><p>Investigation: FZ, AS, MA, RP, AI, CL</p><p>Methodology: FZ, AS, MA, RP, CL</p><p>Project administration: CL</p><p>Resources: FZ, AI, CL</p><p>Software: FZ, RP</p><p>Supervision: AS, MA, CL</p><p>Validation: FZ, AS, MA, RP, RBH, CL</p><p>Visualization: FZ</p><p>Writing &#x2013; original draft: FZ, CL</p><p>Writing &#x2013; review &#x0026; editing: FZ, AS, MA, RP, RBH, AI, CL</p></fn><fn fn-type="conflict"><p>McMaster University, a nonprofit public academic institution, operates contracts through the Health Information Research Unit under the supervision of AI and RBH. These contracts involve professional and commercial publishers to provide newly published studies and systematic reviews, which are critically appraised for research methodology and assessed for clinical relevance as part of McMaster Premium Literature Service (PLUS). CL and RP receive partial compensation through these contracts, and RBH is remunerated for supervisory responsibilities and royalties. 
AS, FZ, and MA have no affiliations with McMaster PLUS.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AOPC</term><def><p>area over the perturbation curve</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb4">IG</term><def><p>integrated gradients</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb7">PLUS</term><def><p>Premium Literature Service</p></def></def-item><def-item><term id="abb8">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb9">XAI</term><def><p>explainable artificial intelligence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>MEDLINE PubMed production statistics</article-title><source>National Library of Medicine</source><year>2018</year><access-date>2025-01-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nlm.nih.gov/bsd/medline_pubmed_production_stats.html">https://www.nlm.nih.gov/bsd/medline_pubmed_production_stats.html</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>BLURB leaderboard</article-title><source>BLURB</source><access-date>2024-08-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://microsoft.github.io/BLURB/leaderboard.html">https://microsoft.github.io/BLURB/leaderboard.html</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names></name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><access-date>2026-05-03</access-date><conf-name>2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2019)</conf-name><conf-date>Jun 2-7, 2019</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/N19-1423.pdf">https://aclanthology.org/N19-1423.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><access-date>2026-05-13</access-date><conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wadden</surname><given-names>JJ</given-names> 
</name></person-group><article-title>Defining the undefinable: the black box problem in healthcare artificial intelligence</article-title><source>J Med Ethics</source><year>2022</year><month>09</month><day>28</day><volume>48</volume><issue>10</issue><fpage>764</fpage><pub-id pub-id-type="doi">10.1136/medethics-2021-107529</pub-id><pub-id pub-id-type="medline">34290113</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gohel</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mohanty</surname><given-names>M</given-names> </name></person-group><article-title>Explainable AI: current status and future directions</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 12, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2107.07045</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>A unified approach to interpreting model predictions</article-title><access-date>2026-05-13</access-date><conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf">https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Bertossi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schleich</surname><given-names>M</given-names> </name><name name-style="western"><surname>Suciu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Vagena</surname><given-names>Z</given-names> </name></person-group><article-title>Causality-based explanation of classification outcomes</article-title><access-date>2026-05-13</access-date><conf-name>International Workshop on Data Management for End-to-End Machine Learning (DEEM&#x2019;20)</conf-name><conf-date>Jun 14, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://people.scs.carleton.ca/~bertossi/papers/DeemWSCamReady.pdf">https://people.scs.carleton.ca/~bertossi/papers/DeemWSCamReady.pdf</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>L&#x00F3;pez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saboya</surname><given-names>M</given-names> </name></person-group><article-title>On the relationship between Shapley and Owen values</article-title><source>Cent Eur J Oper Res</source><year>2009</year><month>12</month><volume>17</volume><issue>4</issue><fpage>415</fpage><lpage>423</lpage><pub-id pub-id-type="doi">10.1007/s10100-009-0100-8</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Enguehard</surname><given-names>J</given-names> </name></person-group><article-title>Sequential integrated gradients: a simple but effective method for explaining language models</article-title><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Jul 9-14, 
2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.477</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sikdar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bhattacharya</surname><given-names>P</given-names> </name><name name-style="western"><surname>Heese</surname><given-names>K</given-names> </name></person-group><article-title>Integrated directional gradients: feature interaction attribution for neural NLP models</article-title><conf-name>The 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.71</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ribeiro</surname><given-names>M</given-names> </name><name name-style="western"><surname>Malcorra</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mota</surname><given-names>NB</given-names> </name><etal/></person-group><article-title>A methodology for explainable large language models with integrated gradients and linguistic analysis in text classification</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 30, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.00250</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Aumann</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Shapley</surname><given-names>LS</given-names> </name></person-group><source>Values of Non-Atomic 
Games</source><year>1974</year><access-date>2025-01-20</access-date><publisher-name>Princeton University Press</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.semanticscholar.org/paper/Values-of-Non-Atomic-Games-Aumann-Shapley/59e35288e6c252ac1a9d8e1ad359b82722792e9a">https://www.semanticscholar.org/paper/Values-of-Non-Atomic-Games-Aumann-Shapley/59e35288e6c252ac1a9d8e1ad359b82722792e9a</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sundararajan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Taly</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>Q</given-names> </name></person-group><article-title>Axiomatic attribution for deep networks</article-title><access-date>2026-05-13</access-date><conf-name>Proceedings of the 34th International Conference on Machine Learning (ICML 2017)</conf-name><conf-date>Aug 6-11, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v70/sundararajan17a/sundararajan17a.pdf">https://proceedings.mlr.press/v70/sundararajan17a/sundararajan17a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mosca</surname><given-names>E</given-names> </name><name name-style="western"><surname>Szigeti</surname><given-names>F</given-names> </name><name name-style="western"><surname>Tragianni</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gallagher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Groh</surname><given-names>G</given-names> </name></person-group><article-title>SHAP-based explanation methods: a review for NLP 
interpretability</article-title><access-date>2026-05-13</access-date><conf-name>Proceedings of the 29th International Conference on Computational Linguistics</conf-name><conf-date>Oct 12-17, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.coling-1.406.pdf">https://aclanthology.org/2022.coling-1.406.pdf</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>Y</given-names> </name></person-group><article-title>Generating hierarchical explanations on text classification via feature interaction detection</article-title><access-date>2026-05-13</access-date><conf-name>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 5-10, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.acl-main.494.pdf">https://aclanthology.org/2020.acl-main.494.pdf</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Minaee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nikzad</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models: a survey</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 9, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.06196</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Zytek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pid&#x00F2;</surname><given-names>S</given-names> </name><name name-style="western"><surname>Veeramachaneni</surname><given-names>K</given-names> </name></person-group><article-title>LLMs for XAI: future directions for explaining explanations</article-title><source>arXiv</source><comment>Preprint posted online on  May 9, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.06064</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zytek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pid&#x00F2;</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alnegheimish</surname><given-names>S</given-names> </name><name name-style="western"><surname>Berti-&#x00C9;quille</surname><given-names>L</given-names> </name><name name-style="western"><surname>Veeramachaneni</surname><given-names>K</given-names> </name></person-group><article-title>Explingo: explaining AI predictions using large language models</article-title><conf-name>2024 IEEE International Conference on Big Data (BigData)</conf-name><conf-date>Dec 15-18, 2024</conf-date><pub-id pub-id-type="doi">10.1109/BigData62323.2024.10825114</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>K</given-names> </name></person-group><article-title>Enhancing the interpretability of SHAP values using large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 24, 2024</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2409.00079</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><source>Claude</source><access-date>2025-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/api">https://www.anthropic.com/api</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><source>OpenAI</source><access-date>2025-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/openai-api/">https://openai.com/index/openai-api/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Llama API</article-title><source>Meta</source><access-date>2025-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.llama.com/products/llama-api/">https://www.llama.com/products/llama-api/</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Parrish</surname><given-names>R</given-names> </name><name name-style="western"><surname>Afzal</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Benchmarking domain-specific pretrained language models to identify the best model for methodological rigor in clinical studies</article-title><source>J Biomed Inform</source><year>2025</year><month>06</month><volume>166</volume><fpage>104825</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104825</pub-id><pub-id pub-id-type="medline">40246186</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lokker</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Bagheri</surname><given-names>E</given-names> </name><name name-style="western"><surname>Abdelkader</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Deep learning to refine the identification of high-quality clinical research articles from the biomedical literature: performance evaluation</article-title><source>J Biomed Inform</source><year>2023</year><month>06</month><volume>142</volume><fpage>104384</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104384</pub-id><pub-id pub-id-type="medline">37164244</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilczynski</surname><given-names>NL</given-names> </name><name name-style="western"><surname>Morgan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Haynes</surname><given-names>RB</given-names> </name><collab>Hedges Team</collab></person-group><article-title>An overview of the design and methods for retrieving high-quality studies for clinical care</article-title><source>BMC Med Inform Decis Mak</source><year>2005</year><month>06</month><day>21</day><volume>5</volume><issue>1</issue><fpage>20</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-5-20</pub-id><pub-id pub-id-type="medline">15969765</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haynes</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cotoi</surname><given-names>C</given-names> </name><etal/></person-group><article-title>McMaster PLUS: a cluster randomized clinical trial of an intervention to accelerate clinical use of evidence-based information from digital 
libraries</article-title><source>J Am Med Inform Assoc</source><year>2006</year><volume>13</volume><issue>6</issue><fpage>593</fpage><lpage>600</lpage><pub-id pub-id-type="doi">10.1197/jamia.M2158</pub-id><pub-id pub-id-type="medline">16929034</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><source>MCMASTER+</source><access-date>2024-08-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://plus.mcmaster.ca/McMasterPLUSDB/">https://plus.mcmaster.ca/McMasterPLUSDB/</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Methodological criteria</article-title><source>Health Information Research Unit</source><access-date>2024-08-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://hiruweb.mcmaster.ca/hkr/what-we-do/methodologic-criteria/">https://hiruweb.mcmaster.ca/hkr/what-we-do/methodologic-criteria/</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>shap.PartitionExplainer</article-title><source>SHAP Documentation</source><access-date>2024-12-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://shap.readthedocs.io/en/latest/generated/shap.PartitionExplainer.html">https://shap.readthedocs.io/en/latest/generated/shap.PartitionExplainer.html</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bitton</surname><given-names>R</given-names> </name><name name-style="western"><surname>Malach</surname><given-names>A</given-names> </name><name name-style="western"><surname>Meiseles</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Latent SHAP: toward practical human-interpretable explanations</article-title><source>arXiv</source><comment>Preprint posted online on  
Nov 27, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2211.14797</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Function calling</article-title><source>OpenAI Developers</source><access-date>2025-02-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/guides/function-calling">https://platform.openai.com/docs/guides/function-calling</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Structured model outputs</article-title><source>OpenAI Developers</source><access-date>2025-02-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/guides/structured-outputs">https://platform.openai.com/docs/guides/structured-outputs</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>D</given-names> </name></person-group><article-title>Comparing automatic and human evaluation of local explanations for text classification</article-title><conf-name>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</conf-name><conf-date>Jun 1-6, 2018</conf-date><pub-id pub-id-type="doi">10.18653/v1/N18-1097</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Samek</surname><given-names>W</given-names> </name><name name-style="western"><surname>Binder</surname><given-names>A</given-names> </name><name name-style="western"><surname>Montavon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lapuschkin</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Muller</surname><given-names>KR</given-names> </name></person-group><article-title>Evaluating the visualization of what a deep neural network has learned</article-title><source>IEEE Trans Neural Netw Learn Syst</source><year>2017</year><month>11</month><volume>28</volume><issue>11</issue><fpage>2660</fpage><lpage>2673</lpage><pub-id pub-id-type="doi">10.1109/TNNLS.2016.2599820</pub-id><pub-id pub-id-type="medline">27576267</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Edin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Motzfeldt</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Ruotsalo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Maal&#x00F8;e</surname><given-names>L</given-names> </name><name name-style="western"><surname>Maistro</surname><given-names>M</given-names> </name></person-group><article-title>Normalized AOPC: fixing misleading faithfulness metrics for feature attributions explainability</article-title><conf-name>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.86</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Covert</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lundberg</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>Understanding global feature 
contributions with additive importance measures</article-title><access-date>2026-05-13</access-date><conf-name>34th Conference on Neural Information Processing Systems (NeurIPS 2020)</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://papers.neurips.cc/paper_files/paper/2020/file/c7bf0b7c1a86d5eb3be2c722cf2cf746-Paper.pdf">https://papers.neurips.cc/paper_files/paper/2020/file/c7bf0b7c1a86d5eb3be2c722cf2cf746-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kelley</surname><given-names>M</given-names> </name><name name-style="western"><surname>Samolyk-Mazzanti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Visweswaran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>An empirical evaluation of prompting strategies for large language models in zero-shot clinical natural language processing: algorithm development and validation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>8</day><volume>12</volume><fpage>e55318</fpage><pub-id pub-id-type="doi">10.2196/55318</pub-id><pub-id pub-id-type="medline">38587879</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>Prompt engineering</article-title><source>OpenAI Developers</source><access-date>2025-03-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/guides/prompt-engineering">https://platform.openai.com/docs/guides/prompt-engineering</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schulhoff</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ilie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Balepur</surname><given-names>N</given-names> </name><etal/></person-group><article-title>The prompt report: a systematic survey of prompt engineering techniques</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.06608</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lv</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>GPT can solve mathematical problems without a calculator</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 6, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.03241</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>S</given-names> </name></person-group><article-title>How well do large language models perform in arithmetic tasks?</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 16, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2304.02015</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>BehnamGhader</surname><given-names>P</given-names> </name><name name-style="western"><surname>Adlakha</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mosbach</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bahdanau</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chapados</surname><given-names>N</given-names> </name><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name></person-group><article-title>LLM2Vec: large language models are secretly powerful text encoders</article-title><access-date>2026-05-13</access-date><conf-name>COLM 2024 (Conference on Language Modeling)</conf-name><conf-date>Nov 6-9, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=IW1PR7vEBf">https://openreview.net/pdf?id=IW1PR7vEBf</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name></person-group><article-title>Fast treeshap: accelerating SHAP value computation for trees</article-title><access-date>2026-05-13</access-date><conf-name>1st Workshop on eXplainable AI approaches for debugging and diagnosis (XAI4Debugging@NeurIPS2021)</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://xai4debugging.github.io/files/papers/fast_treeshap_accelerating_sha.pdf">https://xai4debugging.github.io/files/papers/fast_treeshap_accelerating_sha.pdf</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation 
citation-type="web"><article-title>shap.LinearExplainer</article-title><source>SHAP Documentation</source><access-date>2025-03-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://shap.readthedocs.io/en/latest/generated/shap.LinearExplainer.html">https://shap.readthedocs.io/en/latest/generated/shap.LinearExplainer.html</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marcus</surname><given-names>E</given-names> </name><name name-style="western"><surname>Teuwen</surname><given-names>J</given-names> </name></person-group><article-title>Artificial intelligence and explanation: how, why, and when to explain black boxes</article-title><source>Eur J Radiol</source><year>2024</year><month>04</month><volume>173</volume><issue>111393</issue><fpage>111393</fpage><pub-id pub-id-type="doi">10.1016/j.ejrad.2024.111393</pub-id><pub-id pub-id-type="medline">38417186</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>J</given-names> </name></person-group><article-title>Unbox the black-box for the medical explainable AI via multi-modal and multi-centre data fusion: a mini-review, two showcases and beyond</article-title><source>Inf Fusion</source><year>2022</year><month>01</month><volume>77</volume><fpage>29</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2021.07.016</pub-id><pub-id pub-id-type="medline">34980946</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lokker</surname><given-names>C</given-names> </name><name name-style="western"><surname>Abdelkader</surname><given-names>W</given-names> </name><name name-style="western"><surname>Bagheri</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Boosting efficiency in a clinical literature surveillance system with LightGBM</article-title><source>PLoS Digit Health</source><year>2024</year><month>09</month><volume>3</volume><issue>9</issue><fpage>e0000299</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000299</pub-id><pub-id pub-id-type="medline">39312500</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dias</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Moreira</surname><given-names>VP</given-names> </name><name name-style="western"><surname>Comba</surname><given-names>JLD</given-names> </name></person-group><article-title>RoBIn: a Transformer-based model for risk of bias inference with machine reading comprehension</article-title><source>J Biomed Inform</source><year>2025</year><month>06</month><volume>166</volume><fpage>104819</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104819</pub-id><pub-id pub-id-type="medline">40250743</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Kuiper</surname><given-names>J</given-names> </name><name name-style="western"><surname>Banner</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>Automating biomedical evidence synthesis: RobotReviewer</article-title><conf-name>Proceedings of 
the 55th Annual Meeting of the Association for Computational Linguistics-System Demonstration</conf-name><conf-date>Jul 30 to Aug 4, 2017</conf-date><pub-id pub-id-type="doi">10.18653/v1/P17-4002</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="web"><article-title>Machine learning&#x2014;the game changer for trustworthy evidence</article-title><source>Covidence</source><year>2023</year><access-date>2025-03-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.covidence.org/blog/machine-learning-the-game-changer-for-trustworthy-evidence/">https://www.covidence.org/blog/machine-learning-the-game-changer-for-trustworthy-evidence/</ext-link></comment></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="web"><article-title>DistillerSR AI&#x2014;scientifically validated AI</article-title><source>DistillerSR</source><year>2023</year><access-date>2025-03-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.distillersr.com/products/distillersrai">https://www.distillersr.com/products/distillersrai</ext-link></comment></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="web"><source>Rayyan</source><access-date>2025-03-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.rayyan.ai/">https://www.rayyan.ai/</ext-link></comment></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>BioGPT: generative pre-trained transformer for biomedical text generation and mining</article-title><source>Brief 
Bioinform</source><year>2022</year><month>11</month><day>19</day><volume>23</volume><issue>6</issue><fpage>bbac409</fpage><pub-id pub-id-type="doi">10.1093/bib/bbac409</pub-id><pub-id pub-id-type="medline">36156661</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Waghela</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rakshit</surname><given-names>S</given-names> </name></person-group><article-title>Saliency attention and semantic similarity-driven adversarial perturbation</article-title><conf-name>5th International Conference on Data Science and its Applications (ICDSA&#x2019;24)</conf-name><conf-date>Jul 17-19, 2024</conf-date><pub-id pub-id-type="doi">10.36227/techrxiv.172047313.38449559/v1</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ntrougkas</surname><given-names>MV</given-names> </name><name name-style="western"><surname>Mezaris</surname><given-names>V</given-names> </name><name name-style="western"><surname>Patras</surname><given-names>I</given-names> </name></person-group><article-title>P-TAME: explain any image classifier with trained perturbations</article-title><source>IEEE Open J Signal Process</source><year>2025</year><month>01</month><day>29</day><volume>6</volume><fpage>536</fpage><lpage>545</lpage><pub-id pub-id-type="doi">10.1109/OJSP.2025.3568756</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Ma</surname><given-names>Q</given-names> </name></person-group><article-title>Perturbation-based self-supervised attention for attention bias in text classification</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2023</year><volume>31</volume><fpage>3139</fpage><lpage>3151</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2023.3302230</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mete</surname><given-names>U</given-names> </name><name name-style="western"><surname>&#x00D6;zmen</surname><given-names>&#x00D6;A</given-names> </name></person-group><article-title>Assessing the accuracy and reproducibility of ChatGPT for responding to patient inquiries about otosclerosis</article-title><source>Eur Arch Otorhinolaryngol</source><year>2025</year><month>03</month><volume>282</volume><issue>3</issue><fpage>1567</fpage><lpage>1575</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-09039-4</pub-id><pub-id pub-id-type="medline">39461921</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Heybati</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shammas-Toma</surname><given-names>M</given-names> </name></person-group><article-title>When vision meets reality: exploring the clinical applicability of GPT-4 with vision</article-title><source>Clin Imaging</source><year>2024</year><month>04</month><volume>108</volume><fpage>110101</fpage><pub-id pub-id-type="doi">10.1016/j.clinimag.2024.110101</pub-id><pub-id pub-id-type="medline">38341880</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name><name name-style="western"><surname>Van Bulck</surname><given-names>L</given-names> </name><name name-style="western"><surname>Durieux</surname><given-names>BN</given-names> </name><name name-style="western"><surname>Lindvall</surname><given-names>C</given-names> </name></person-group><article-title>The temperature feature of ChatGPT: modifying creativity for clinical research</article-title><source>JMIR Hum Factors</source><year>2024</year><month>03</month><day>8</day><volume>11</volume><fpage>e53559</fpage><pub-id pub-id-type="doi">10.2196/53559</pub-id><pub-id pub-id-type="medline">38457221</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>National Academies of Sciences, Engineering, and Medicine, Policy and Global Affairs, Committee on Science, Engineering, Medicine, and Public Policy, Board on Research Data and Information, Division on Engineering and Physical Sciences, Committee on Applied and Theoretical Statistics, Board on Mathematical Sciences and Analytics, Division on Earth and Life Studies, Nuclear and Radiation Studies Board, Division of Behavioral and Social Sciences and Education, Committee on National Statistics, Board on Behavioral, Cognitive, and Sensory Sciences, Committee on Reproducibility and Replicability in Science</collab></person-group><article-title>Understanding reproducibility and replicability</article-title><source>Reproducibility and Replicability in Science</source><year>2019</year><access-date>2025-03-02</access-date><publisher-name>National Academies Press</publisher-name><fpage>39</fpage><lpage>54</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK547546">https://www.ncbi.nlm.nih.gov/books/NBK547546</ext-link></comment></nlm-citation></ref><ref 
id="ref62"><label>62</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Melamed</surname><given-names>O</given-names> </name><name name-style="western"><surname>Caruana</surname><given-names>R</given-names> </name></person-group><article-title>Explaining high-dimensional text classifiers</article-title><access-date>2026-05-13</access-date><conf-name>XAI in Action: Past, Present, and Future Applications</conf-name><conf-date>Dec 16, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://neurips.cc/virtual/2023/75167">https://neurips.cc/virtual/2023/75167</ext-link></comment></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>X</given-names> </name><name name-style="western"><surname>Salcianu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Dopson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>D</given-names> </name></person-group><article-title>Fast WordPiece tokenization</article-title><conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 7-11, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2021.emnlp-main.160.pdf">https://aclanthology.org/2021.emnlp-main.160.pdf</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>F</given-names> </name></person-group><article-title>Zhfwen/rct_gpt_perturbation</article-title><source>GitHub</source><access-date>2026-04-16</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://github.com/zhfwen/rct_gpt_perturbation">https://github.com/zhfwen/rct_gpt_perturbation</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt definitions, software environments, and additional analyses.</p><media xlink:href="medinform_v14i1e81644_app1.docx" xlink:title="DOCX File, 2133 KB"/></supplementary-material></app-group></back></article>