<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e77943</article-id><article-id pub-id-type="doi">10.2196/77943</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Zero-Shot Classifiers for Categorizing RCT Abstracts by Intervention Type: Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Buitrago-Garcia</surname><given-names>Diana</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Courvoisier</surname><given-names>Delphine S</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Capderou</surname><given-names>Sami</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Iudici</surname><given-names>Michele</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Mongin</surname><given-names>Denis</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Rheumatology, Geneva University Hospitals and University of Geneva</institution><addr-line>Rue Alcide-Jentzer 22</addr-line><addr-line>Gen&#x00E8;ve</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Surgery Department, Geneva University Hospitals and University of Geneva, Geneva</institution><addr-line>Geneva</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Kotze</surname><given-names>Eduan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kuppan</surname><given-names>Karthigeyan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yao</surname><given-names>Li-Hung</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Windisch</surname><given-names>Paul</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Denis Mongin, PhD, Division of Rheumatology, Geneva University Hospitals and University of Geneva, Rue Alcide-Jentzer 22, Gen&#x00E8;ve, 1205, Switzerland, 41 022 372 36 78; <email>denis.mongin@hug.ch</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>18</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e77943</elocation-id><history><date 
date-type="received"><day>22</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>05</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Diana Buitrago-Garcia, Delphine S Courvoisier, Sami Capderou, Michele Iudici, Denis Mongin. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 18.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e77943"/><abstract><sec><title>Background</title><p>Artificial intelligence has gained relevance due to its potential to reduce the workload in evidence synthesis or bibliometric projects. While the main focus has been lately on the use of instruction-tuned large language models, zero-shot classification models have not been tested for such task. These models are large language models trained on large datasets of labeled data able to categorize text among proposed labels, irrespective of the text domain or the topic. 
They are relatively small, able to run on consumer-grade computers, and almost hyperparameter-free.</p></sec><sec><title>Objective</title><p>In our study, we use abstracts of randomized clinical trials in rheumatology as a case example to evaluate the performance of openly available, generalist, zero-shot classification models in classifying types of interventions against a human gold standard.</p></sec><sec sec-type="methods"><title>Methods</title><p>We classified all rheumatology randomized controlled trial (RCT) abstracts published between 2009 and 2022 (n=1054) as &#x201C;drug&#x201D; or &#x201C;non-drug&#x201D; using two zero-shot text classification models (DeBERTa and BART) and few-shot prompting using Llama3 8B. Different labeling of categories provided to the zero-shot classification models and different prompts provided to Llama3 8B were tested. Performance was evaluated using accuracy and predictive value of both categories against a human gold standard.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 1054 RCTs, 452 (42.9%) assessed drug interventions. Among the zero-shot classifiers, the DeBERTa model achieved the highest accuracy (929/1054, 88.1%; 95% CI 86%&#x2010;90%) when using the &#x201C;drug&#x201D; and &#x201C;non-drug&#x201D; labels. Llama3 8B and few-shot prompting had slightly higher accuracy and predictive values. Both zero-shot and Llama3 8B models had performance on par with a human without experience in evidence synthesis (905/1054, 85.9%; 95% CI 83.6%&#x2010;87.8% accuracy). Misclassifications occurred for trials where the intervention was harder to classify, such as procedures (eg, intra-articular injections), food compounds, vitamins, supplements, or biological treatments.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study shows the potential of zero-shot classification models for simple classification tasks, demonstrating accuracy comparable to that of an untrained human. 
These models are potential tools to streamline systematic review tasks for bibliometric studies in classifying abstracts by supplementing one reviewer.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>LLM</kwd><kwd>automation tools</kwd><kwd>evidence synthesis</kwd><kwd>methodology</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The study of scientific publications is at the core of bibliometric research on evolving trends [<xref ref-type="bibr" rid="ref1">1</xref>] and supports evidence synthesis addressing key questions in clinical and public health decision-making [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Abstract classification, the first step in many review processes, typically requires two independent reviewers to minimize errors and bias, with a third resolving disagreements [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. This procedure is time-consuming [<xref ref-type="bibr" rid="ref5">5</xref>] and creates a considerable burden for review teams in projects addressing broad questions.</p><p>Advances in artificial intelligence (AI) have introduced new opportunities to reduce workload, especially for abstract and title screening tasks [<xref ref-type="bibr" rid="ref6">6</xref>]. Two main categories of AI tools have been used to date. The first includes machine learning algorithms trained directly for a specific classification task. This approach has demonstrated good and reproducible performance [<xref ref-type="bibr" rid="ref7">7</xref>] but it lacks versatility and requires task-specific training. The second category involves prompting large language models [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. 
While they are highly versatile and not dependent on task-specific training, their performance can vary depending on the prompts provided and the sampling parameters, and their use requires substantial computing power or a paid private application programming interface (API).</p><p>Zero-shot classification models [<xref ref-type="bibr" rid="ref10">10</xref>] represent a third, less explored option with potential advantages. These language models, constrained to assigning a probability to a label for a given text, are fine-tuned on large datasets of labeled data. They subsequently develop natural language inference capabilities allowing them to categorize text into user-defined labels without domain-specific training. Although they can be further fine-tuned, we focus here on generalist zero-shot models that are widely available and not trained on a specific domain. They are simple to use, require minimal computational resources, have no tunable input hyperparameters beyond the label set, and have broad applicability. They therefore offer both the versatility required for abstract screening and the reproducibility needed in research. Previous studies have reported encouraging results in classifying texts [<xref ref-type="bibr" rid="ref10">10</xref>], and heterogeneous results have been reported for abstract screening in systematic reviews [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>In our study, we use abstracts of randomized clinical trials in rheumatology as a case example to evaluate the performance of openly available, generalist, zero-shot classification models in classifying types of interventions against a human gold standard. As a benchmark, we compare their performance with the lightest version of Llama 3 available at the time of the study (8 billion parameters).</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>We used an existing database of all primary reports of randomized controlled trials (RCTs) published in rheumatology between 2009 and 2022. 
Full details about search methodology can be found in a published report [<xref ref-type="bibr" rid="ref12">12</xref>] (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><sec id="s2-1"><title>Data Extraction</title><p>One reviewer with no previous experience in evidence synthesis or rheumatology (SC) extracted information of RCTs according to the type of intervention. A second reviewer with experience in evidence synthesis (DBG) and rheumatology reviewed the data classification of the first reviewer. Disagreements were resolved by a trained rheumatologist (MI). The classification obtained was considered the gold standard.</p><p>The type of intervention, pharmacological or nonpharmacological (<italic>drug or non-drug intervention</italic>) was extracted for each abstract. RCTs assessing nondrug interventions [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] were further categorized as: Behavioral, Biological treatments, Delivery of health care services, Device, Education, Exercise therapy, Food/plants/supplements, Procedure, Surgical, Wellness and spa, and Other (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The Geneva Research Ethics Committee exempted the present study from formal ethics review since it is based on publicly available data.</p></sec><sec id="s2-3"><title>Language Models</title><sec id="s2-3-1"><title>Zero-Shot Classifiers</title><p>RCTs were classified by type of intervention (&#x201C;<italic>drug&#x201D;</italic> or <italic>&#x201C;non-drug</italic>&#x201D;) using the two most popular zero-shot classifiers [<xref ref-type="bibr" rid="ref10">10</xref>] available at the HuggingFace platform [<xref ref-type="bibr" rid="ref15">15</xref>], namely a classifier based on the DeBERTa model by Microsoft [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" 
rid="ref17">17</xref>], and another classifier developed by Meta (<italic>bart-large-mnli, based on BART</italic> [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]). Both models have a total size of less than half a billion parameters.</p><p>Zero-shot models only require labels to classify the text and provide a probability for each label (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Abstracts were assigned to the intervention with the highest probability. To assess whether the wording of the labels could affect the classification performance, we tested seven different label combinations: the first five had two labels (drug intervention/nondrug intervention, drug intervention/other, pharmacological treatment/ nonpharmacological treatment, pharmacological treatment/ other, drug/nondrug), and the last two had the details of the &#x201C;non-drug&#x201D; categories (Table S2-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Example of the outputs of the zero-shot classifiers (<bold>A</bold>) and of Llama3 8B LLM (<bold>B</bold>) when provided a rheumatology RCT abstract. RCT: randomized controlled trial.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e77943_fig01.png"/></fig></sec><sec id="s2-3-2"><title>Large Language Models</title><p>We used the 8B (8 Billion parameters) version of Meta Llama 3 [<xref ref-type="bibr" rid="ref20">20</xref>] quantized in 8bit, using a prompt with few-shot prompting with chain-of-thought method [<xref ref-type="bibr" rid="ref21">21</xref>]. The prompt specified a structured output to extract the answer, asking to provide first an explanation after an &#x201C;R:&#x201D; mark, and then the classification after an &#x201C;A:&#x201D; mark. 
Outputs that were not properly formatted or that provided categories not listed in the initial prompt were considered as missing and were counted as wrong in all performance metrics. The parameters used are based on a previous study [<xref ref-type="bibr" rid="ref22">22</xref>]: a temperature of 0.3, cumulative probability and most likely next word sampling (top_p and top_k), with top_k=40 and top_p=0.95. Models were run on a computer with an NVIDIA Titan X with 12 GB of VRAM, in conjunction with 20 GB of RAM and a standard CPU. Inference was performed using Python 3.10.4, numpy 1.22.3, pandas 1.4.2, and transformers 4.57.1. The data and prompts used are publicly available in a GitLab repository [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec></sec><sec id="s2-4"><title>Data Analysis</title><p>Performance of each model and of the nontrained reviewer (SC) was assessed using four metrics:</p><list list-type="endash"><list-item><p>Accuracy, defined as the sum of correct outcomes according to the gold standard divided by the total number of abstracts classified.</p></list-item><list-item><p>Predictive value for the study being about drugs (PVD), defined as the proportion of abstracts classified as drug intervention that were drug interventions according to the gold standard.</p></list-item><list-item><p>Predictive value for the study being about nondrugs (PVND), defined as the proportion of abstracts classified as nondrug intervention that were nondrug interventions according to the gold standard.</p></list-item><list-item><p>Macro F1, defined as the average of the F1 obtained for the classification of each label (drug and nondrug). F1 is the harmonic mean of the precision and recall.</p></list-item></list><p>For all the metrics, the denominator includes all abstracts, even those for which the algorithm provided badly formatted outputs. Data were summarized using frequencies and percentages for categorical variables and median and interquartile range for continuous variables. 
Model performances were compared to those of the first reviewer.</p><p>Confidence intervals were computed using the Wilson method [<xref ref-type="bibr" rid="ref24">24</xref>]. The analysis was done using R software (version 4.2.0; R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Of the 1054 RCTs included, 452 (42.9%) assessed drug interventions and 602 (57.1%) assessed nondrug interventions. Among nondrug interventions, the most common were exercise therapy (147/602, 24.4%), procedures (144/602, 23.9%), and delivery of health care services (88/602, 14.6%) (<xref ref-type="table" rid="table1">Table 1</xref>).</p><p>Zero-shot classifiers took 0.3 GB of VRAM and around 10 seconds per abstract to run, while the quantized version of Llama3 8B took 10 GB of VRAM and 4 seconds per abstract.</p><p>The zero-shot classification DeBERTa-based model achieved the highest accuracy among the zero-shot classifiers using &#x201C;drug&#x201D; or &#x201C;non-drug&#x201D; categories, with an accuracy of 88.1% (95% CI 86.0%&#x2010;90.0%), a predictive value for the drug category (PVD) of 80.6% (95% CI 77.0%&#x2010;83.7%), a predictive value for the nondrug category (PVND) of 96.0% (95% CI 93.9%&#x2010;97.3%), and a macro F1 of 88.1% (95% CI 86.7%&#x2010;89.5%) (<xref ref-type="table" rid="table2">Table 2</xref>, supplementary Tables S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), but the accuracy varied substantially according to the label wording. The BART-based model was less affected by the wording of the labels. 
Its highest accuracy was achieved with the &#x201C;drug intervention,&#x201D; &#x201C;non-drug intervention&#x201D; labels (labeling strategy 1 in <xref ref-type="table" rid="table2">Table 2</xref>), with an accuracy of 86.9% (95% CI 84.7%&#x2010;88.8%), a PVD of 81.2% (95% CI 77.5%&#x2010;84.3%), a PVND of 92.2% (95% CI 89.6%&#x2010;94.1%), and a macro F1 of 86.8% (95% CI 85.3%&#x2010;88.2%) (<xref ref-type="table" rid="table2">Table 2</xref>, supplementary Tables S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The approaches based on Llama3 8B and few-shot prompting reached 90.8% (95% CI 88.9%&#x2010;92.4%) accuracy with a prompt requiring a classification into 11 categories and providing no context. Fewer than 2% of the outputs were not properly formatted and were considered as missing. The predictive values were 89.2% (95% CI 86.0%&#x2010;91.7%) for the drug category and 92.0% (95% CI 89.6%&#x2010;93.9%) for the nondrug category, reaching a macro F1 of 90.6% (95% CI 89.3%&#x2010;91.9%) (<xref ref-type="table" rid="table2">Table 2</xref>, supplementary Tables S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Adding context (prompt 1 vs 2, or prompt 3 vs 4) improved the performance of the LLM classification only for the two-category approach, where the confidence intervals of accuracy overlapped only marginally (prompt 3: 89.3%, 95% CI 87.3%&#x2010;91.0% vs prompt 4: 85.5%, 95% CI 83.2%&#x2010;87.5%). 
Using only two categories yielded a lower PVD but a higher PVND compared to 11-category prompts.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Interventions assessed in the 1054 rheumatology RCTs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> included in this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Intervention</td><td align="left" valign="bottom">RCTs (N=1054), n (%)<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Drug interventions</td><td align="left" valign="top">452 (42.9)</td></tr><tr><td align="left" valign="top" colspan="2">Nondrug interventions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exercise therapy</td><td align="left" valign="top">147 (13.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Procedure</td><td align="left" valign="top">144 (13.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Delivery of health care services</td><td align="left" valign="top">88 (8.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Device</td><td align="left" valign="top">50 (4.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Food/plants/supplements</td><td align="left" valign="top">45 (4.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Education</td><td align="left" valign="top">31 (2.9)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Surgical</td><td align="left" valign="top">27 (2.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Wellness and spa</td><td align="left" valign="top">22 (2.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Behavioral</td><td align="left" valign="top">19 (1.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Biological treatments</td><td align="left" valign="top">16 (1.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">13 (1.2)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>RCT: randomized controlled trials.</p></fn><fn id="table1fn2"><p><sup>b</sup>Note: Percentages may not total exactly 100 due to rounding</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance metrics of zero-shot classifiers and Llama3 8B in classifying RCT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> abstract between &#x201C;drug&#x201D; and &#x201C;non-drug&#x201D; intervention, for various labeling strategies (for the zero-shot classifiers) and prompting approaches (for Llama3 8B, see supplementary table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Prompt/ Labeling strategy<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">Accuracy<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" 
valign="bottom">Predictive value drugs (PVD)<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>% [95% CI]</td><td align="left" valign="bottom">Predictive value nondrugs (PVND)<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup>% [95% CI]</td><td align="left" valign="bottom">Macro F1% [95% CI<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>]</td><td align="left" valign="bottom">Badly formatted output</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Zero-shot</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" rowspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeBERTa</td><td align="left" valign="top">1</td><td align="left" valign="top">82.5% [80.1&#x2010;84.7%]</td><td align="left" valign="top">71.5% [67.8&#x2010;74.9%]</td><td align="left" valign="top">98.6% [97.0&#x2010;99.4%]</td><td align="left" valign="top">82.5% [80.9&#x2010;84.2%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">70.3% [67.5&#x2010;73%]</td><td align="left" valign="top">59.1% [55.6&#x2010;62.6%]</td><td align="left" valign="top">99.3% [97.5&#x2010;99.8%]</td><td align="left" valign="top">69.6% [67.6&#x2010;71.6%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">84.3% [81.9&#x2010;86.3%]</td><td align="left" valign="top">73.6% [69.9&#x2010;77%]</td><td align="left" valign="top">98.7% [97.1&#x2010;99.4%]</td><td align="left" valign="top">84.3% [82.7&#x2010;85.8%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">78.4% [75.8&#x2010;80.7%]</td><td align="left" valign="top">66.6% [63.0&#x2010;70.1%]</td><td align="left" valign="top">99.2% [97.7&#x2010;99.7%]</td><td align="left" valign="top">78.3% 
[76.5&#x2010;80.0%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">88.1% [86.0&#x2010;90.0%]</td><td align="left" valign="top">80.6% [77.0&#x2010;83.7%]</td><td align="left" valign="top">96.0% [93.9&#x2010;97.3%]</td><td align="left" valign="top">88.1% [86.7&#x2010;89.5%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">82.4% [79.9&#x2010;84.5%]</td><td align="left" valign="top">72.4% [68.7&#x2010;75.8%]</td><td align="left" valign="top">95.2% [92.9&#x2010;96.8%]</td><td align="left" valign="top">82.4% [80.7&#x2010;84.0%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">69.6% [66.8&#x2010;72.3%]</td><td align="left" valign="top">81.1% [75.3&#x2010;85.8%]</td><td align="left" valign="top">66.7% [63.5&#x2010;69.8%]</td><td align="left" valign="top">64.8% [62.6&#x2010;67.0%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" rowspan="7">&#x2003;BART</td><td align="left" valign="top">1</td><td align="left" valign="top">86.9% [84.7&#x2010;88.8%]</td><td align="left" valign="top">81.2% [77.5&#x2010;84.3%]</td><td align="left" valign="top">92.2% [89.6&#x2010;94.1%]</td><td align="left" valign="top">86.8% [85.3&#x2010;88.2%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">85.4% [83.1&#x2010;87.4%]</td><td align="left" valign="top">76.3% [72.7&#x2010;79.6%]</td><td align="left" valign="top">95.9% [93.8&#x2010;97.3%]</td><td align="left" valign="top">85.4% [83.9&#x2010;86.9%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">86.3% [84.1&#x2010;88.3%]</td><td align="left" valign="top">82.6% [78.9&#x2010;85.8%]</td><td align="left" valign="top">89.3% [86.6&#x2010;91.6%]</td><td 
align="left" valign="top">86.1% [84.6&#x2010;87.6%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">86.1% [83.9&#x2010;88.1%]</td><td align="left" valign="top">78.5% [74.9&#x2010;81.8%]</td><td align="left" valign="top">94% [91.6&#x2010;95.8%]</td><td align="left" valign="top">86.1% [84.6&#x2010;87.6%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">84.9% [82.6&#x2010;86.9%]</td><td align="left" valign="top">82.9% [79.1&#x2010;86.1%]</td><td align="left" valign="top">86.4% [83.4&#x2010;88.9%]</td><td align="left" valign="top">84.6% [83.0&#x2010;86.1%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">86.1% [83.9&#x2010;88.1%]</td><td align="left" valign="top">79.8% [76.1&#x2010;83.0%]</td><td align="left" valign="top">92.2% [89.7&#x2010;94.2%]</td><td align="left" valign="top">86.1% [84.6&#x2010;87.5%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">77.8% [75.2&#x2010;80.2%]</td><td align="left" valign="top">83.6% [79.2&#x2010;87.3%]</td><td align="left" valign="top">75.2% [71.9&#x2010;78.2%]</td><td align="left" valign="top">76.1% [74.2&#x2010;78.0%]</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="5">Llama3 8B</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" rowspan="4">&#x2003;</td><td align="left" valign="top">1</td><td align="left" valign="top">89.5% [87.5&#x2010;91.2%]</td><td align="left" valign="top">88% [84.6&#x2010;90.7%]</td><td align="left" valign="top">90.6% [88.0&#x2010;92.7%]</td><td align="left" valign="top">89.2% [87.9&#x2010;90.6%]</td><td align="left" valign="top">3.1% [2.2&#x2010;4.4%]</td></tr><tr><td align="left" 
valign="top">2</td><td align="left" valign="top">90.8% [88.9&#x2010;92.4%]</td><td align="left" valign="top">89.2% [86.0&#x2010;91.7%]</td><td align="left" valign="top">92.0% [89.6&#x2010;93.9%]</td><td align="left" valign="top">90.6% [89.3&#x2010;91.9%]</td><td align="left" valign="top">1.9% [1.2&#x2010;2.9%]</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">89.3% [87.3&#x2010;91.0%]</td><td align="left" valign="top">82.2% [78.7&#x2010;85.2%]</td><td align="left" valign="top">96.4% [94.4&#x2010;97.7%]</td><td align="left" valign="top">89.2% [87.9&#x2010;90.6%]</td><td align="left" valign="top">0.3% [0.1&#x2010;0.8%]</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">85.5% [83.2&#x2010;87.5%]</td><td align="left" valign="top">79% [75.3&#x2010;82.3%]</td><td align="left" valign="top">91.7% [89.0&#x2010;93.7%]</td><td align="left" valign="top">85.4% [83.9&#x2010;86.9%]</td><td align="left" valign="top">5.1% [3.9&#x2010;6.6%]</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>RCT: randomized controlled trial.</p></fn><fn id="table2fn2"><p><sup>b</sup>Detailed information about the labels used are in supplementary Table S2, and in supplementary table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for prompts.</p></fn><fn id="table2fn3"><p><sup>c</sup>Sum of correct outcomes according to the gold standard divided by the total number of abstracts classified.</p></fn><fn id="table2fn4"><p><sup>d</sup>Proportion of abstracts properly classified as drug intervention.</p></fn><fn id="table2fn5"><p><sup>e</sup>Proportion of abstracts properly classified as nondrug intervention.</p></fn><fn id="table2fn6"><p><sup>f</sup>Average of F1 for each label. 
F1 is the harmonic mean of precision and recall.</p></fn></table-wrap-foot></table-wrap><p>The nontrained reviewer reached 85.9% accuracy (95% CI 83.6%&#x2010;87.8%), with a lower PVD (75.6%; 95% CI 72.0%&#x2010;78.9%) than those obtained with AI models but a higher PVND (99.1%; 95% CI 97.8%&#x2010;99.7%). When looking specifically at nondrug interventions, both the nontrained reviewer and BART had suboptimal performance for trials involving procedures (eg, intra-articular injections), food compounds, vitamins, supplements, or biological treatments. These misclassifications were not resolved by using explicit labeling of all nondrug categories (labeling strategies 6 and 7, see supplementary table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Llama3 8B, aided by detailed prompts, properly classified food/supplements as nondrug and performed better in general at classifying drug interventions (<xref ref-type="fig" rid="figure2">Figure 2</xref>, supplementary table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Performance metrics when classifying abstracts as &#x201C;drug&#x201D; or &#x201C;non-drug&#x201D; interventions, for the untrained reviewer (left column), the BART-based zero-shot classifier (middle column) and the Llama3 8B model (right column).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e77943_fig02.png"/></fig><p>When testing the zero-shot classifiers on the eleven categories of nondrug intervention, the classifiers performed poorly overall, with discrepant results between categories: some, such as &#x201C;surgical,&#x201D; reached an accuracy as high as 92.6% (95% CI 76.6%&#x2010;97.9%), whereas others, such as &#x201C;wellness&#x201D; or &#x201C;device,&#x201D; had zero or close to zero accuracy (see Table S7 in <xref 
ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for full details).</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study found that zero-shot classifier models can reach 85%&#x2010;90% accuracy on a simple binary classification task and 85% macro F1, despite not being trained on medical or rheumatology-specific data. Meta&#x2019;s BART model was the least sensitive to changes in label phrasing and produced the best overall performance when using simple two-category labels. Its accuracy was comparable to that of an untrained human reviewer and slightly lower than that of Llama3 8B. Given its ease of use and small computational footprint, BART is a practical option for abstract classification tasks in resource-limited settings.</p><p>Both zero-shot classification models struggled to classify RCTs testing procedures, food/supplements, or biological treatments as drugs. These interventions resemble drug-based treatments (eg, supplements, biologics) or involve drugs administered through procedures, yet differ by requiring specialized expert actions (eg, injections, ultrasound guidance). This nuance created difficulties for both the zero-shot models and the untrained reviewer. Adjusting label phrasing did not resolve these errors, likely because zero-shot models cannot use detailed task instructions. For Llama3 8B, the effect of prompt design was more nuanced: providing context improved accuracy only in the two-category setting, while the number of categories influenced the balance between predictive values for drugs and nondrugs. It is expected that more advanced LLMs may reduce such misclassifications when provided with explicit explanatory prompts. 
Zero-shot classifiers were unable to reliably handle multi-label or more complex classification tasks, for which high-performance LLMs remain more suitable.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>Our results align with studies evaluating RCT abstracts using trained machine learning models [<xref ref-type="bibr" rid="ref26">26</xref>], or prompt-based LLMs [<xref ref-type="bibr" rid="ref27">27</xref>], which suggest these methods could supplement a reviewer for simple classification tasks. Recent work [<xref ref-type="bibr" rid="ref28">28</xref>] shows that bigger models (Llama3 70B) can reach 95% accuracy on the same classification task, albeit at substantially greater computational cost, and that aggregating outputs from multiple LLMs can exceed human-level performance. Nevertheless, expert reviewers remain essential to supervise the process and prevent inappropriate exclusion or misclassification of studies [<xref ref-type="bibr" rid="ref29">29</xref>].</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>This study analyzed over 1000 rheumatology RCT abstracts, enabling precise estimates of model performance. It also examined the details of the subtypes of nondrug interventions, to allow a good understanding of where humans and LLMs misclassify. The zero-shot models presented are free to use, have been available and stable for more than three years, and run on standard computers. In addition, we provide the prompts and data used in our study, enabling replication of our methods in different medical fields.</p><p>Several limitations should be noted. First, we evaluated only a single, simple binary classification task within rheumatology. Thus, the present results may not generalize to other fields or to multi-label tasks. Second, LLMs are developing rapidly, and better performance could be obtained using more recent models. 
Finally, the reference human performance is based on a single untrained reviewer, which may not reflect broader human variability, though the observed error rate aligns with typical rates reported for systematic review screening [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s4-4"><title>Conclusion</title><p>Zero-shot classification models can classify with accuracy similar to that of a scientist untrained in evidence synthesis and perform well on simple classification tasks, while not requiring specific training for the scientific domain or high-performance computing. These tools have the potential to facilitate classification processes of systematic reviews or bibliometric studies by replacing one reviewer for simple classification tasks.</p></sec></sec></body><back><ack><p>Generative AI tools were not used for the present study.</p></ack><notes><sec><title>Funding</title><p>This project was funded by the Swiss National Science Foundation (Grant number 212393).</p></sec><sec><title>Data Availability</title><p>The data that supports the findings of this study is publicly available in a GitLab repository [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>DB-G: Data curation, Investigation, Validation, Visualization, Writing &#x2013; original draft</p><p>DSC: Validation, Writing &#x2013; review &#x0026; editing</p><p>SC: Data curation, Investigation, Writing &#x2013; review &#x0026; editing</p><p>MI: Funding acquisition, Writing &#x2013; review &#x0026; editing</p><p>DM: Conceptualization, Data curation, Formal analysis, Funding acquisition, Methodology, Investigation, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming 
interface</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">PVD</term><def><p>predictive value for the study being about drugs</p></def></def-item><def-item><term id="abb5">PVND</term><def><p>predictive value for the study being about nondrugs</p></def></def-item><def-item><term id="abb6">RCT</term><def><p>randomized controlled trial</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Donthu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mukherjee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>WM</given-names> </name></person-group><article-title>How to conduct a bibliometric analysis: an overview and guidelines</article-title><source>J Bus Res</source><year>2021</year><month>09</month><volume>133</volume><fpage>285</fpage><lpage>296</lpage><pub-id pub-id-type="doi">10.1016/j.jbusres.2021.04.070</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><article-title>The value of evidence synthesis</article-title><source>Nat Hum Behav</source><year>2021</year><month>05</month><day>1</day><volume>5</volume><issue>5</issue><fpage>539</fpage><lpage>539</lpage><pub-id pub-id-type="doi">10.1038/s41562-021-01131-7</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><source>Methodological Expectations of Cochrane Intervention Reviews (MECIR) Standards for the conduct and reporting of | Policy Commons</source><access-date>2024-08-28</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://policycommons.net/artifacts/1712743/methodological-expectations-of-cochrane-intervention-reviews-mecir-standards-for-the-conduct-and-reporting-of/2444392/">https://policycommons.net/artifacts/1712743/methodological-expectations-of-cochrane-intervention-reviews-mecir-standards-for-the-conduct-and-reporting-of/2444392/</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><name name-style="western"><surname>Affengruber</surname><given-names>L</given-names> </name><name name-style="western"><surname>Titscher</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Single-reviewer abstract screening missed 13 percent of relevant studies: a crowd-based, randomized controlled trial</article-title><source>J Clin Epidemiol</source><year>2020</year><month>05</month><volume>121</volume><fpage>20</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.01.005</pub-id><pub-id pub-id-type="medline">31972274</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borah</surname><given-names>R</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Capers</surname><given-names>PL</given-names> </name><name name-style="western"><surname>Kaiser</surname><given-names>KA</given-names> </name></person-group><article-title>Analysis of the time and workers needed to conduct systematic reviews of medical interventions using data from the PROSPERO registry</article-title><source>BMJ Open</source><year>2017</year><month>02</month><volume>7</volume><issue>2</issue><fpage>e012545</fpage><pub-id 
pub-id-type="doi">10.1136/bmjopen-2016-012545</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Affengruber</surname><given-names>L</given-names> </name><name name-style="western"><surname>van der Maten</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Spiero</surname><given-names>I</given-names> </name><etal/></person-group><article-title>An exploration of available methods and tools to improve the efficiency of systematic review production: a scoping review</article-title><source>BMC Med Res Methodol</source><year>2024</year><month>09</month><day>18</day><volume>24</volume><issue>1</issue><pub-id pub-id-type="doi">10.1186/s12874-024-02320-4</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Hersh</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yen</surname><given-names>PY</given-names> </name></person-group><article-title>Reducing Workload in Systematic Review Preparation Using Automated Citation Classification</article-title><source>J Am Med Inform Assoc</source><year>2006</year><month>03</month><day>1</day><volume>13</volume><issue>2</issue><fpage>206</fpage><lpage>219</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1929</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Park</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Paget</surname><given-names>M</given-names> </name><name name-style="western"><surname>Naugler</surname><given-names>C</given-names> </name></person-group><article-title>Automated Paper Screening for Clinical Reviews Using Large Language Models: Data Analysis Study</article-title><source>J Med Internet Res</source><year>2024</year><month>01</month><day>12</day><volume>26</volume><fpage>e48996</fpage><pub-id pub-id-type="doi">10.2196/48996</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aum</surname><given-names>S</given-names> </name><name name-style="western"><surname>Choe</surname><given-names>S</given-names> </name></person-group><article-title>srBERT: automatic article classification model for systematic review using BERT</article-title><source>Syst Rev</source><year>2021</year><month>12</month><volume>10</volume><issue>1</issue><pub-id pub-id-type="doi">10.1186/s13643-021-01763-w</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yin</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Roth</surname><given-names>D</given-names> </name></person-group><article-title>Benchmarking zero-shot text classification: datasets, evaluation and entailment approach</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 31, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1909.00161</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Moreno-Garc&#x00ED;a</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Jayne</surname><given-names>C</given-names> </name><name name-style="western"><surname>Elyan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Aceves-Martins</surname><given-names>M</given-names> </name></person-group><article-title>Abstract screening for systematic reviews using machine learning and zero-shot classification</article-title><source>SSRN</source><comment>Preprint posted online on  Sep 12, 2022</comment><pub-id pub-id-type="doi">10.2139/ssrn.4210704</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mongin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Buitrago-Garcia</surname><given-names>D</given-names> </name><name name-style="western"><surname>Capderou</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Prospective registration of trials: where we are, why, and how we could get better</article-title><source>J Clin Epidemiol</source><year>2024</year><month>12</month><volume>176</volume><fpage>111586</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111586</pub-id><pub-id pub-id-type="medline">39481460</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gelijns</surname><given-names>AC</given-names> </name><collab>Institute of Medicine (US) Committee on Technological Innovation in Medicine</collab></person-group><article-title>4. 
the development of clinical procedures</article-title><source>Technological Innovation: Comparing Development of Drugs, Devices, and Procedures in Medicine</source><year>1986</year><access-date>2025-01-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK222716">https://www.ncbi.nlm.nih.gov/books/NBK222716</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ninot</surname><given-names>G</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ninot</surname><given-names>G</given-names> </name></person-group><article-title>Defining non-pharmacological interventions (npis)</article-title><source>Non-Pharmacological Interventions: An Essential Answer to Current Demographic, Health, and Environmental Transitions</source><year>2021</year><publisher-name>Springer International Publishing</publisher-name><fpage>1</fpage><lpage>46</lpage><pub-id pub-id-type="other">978-3-030-60971-9</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>Zero-shot classification models</article-title><source>HuggingFace</source><year>2023</year><access-date>2025-11-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/models">https://huggingface.co/models</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>P</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> 
</name></person-group><article-title>DeBERTa: decoding-enhanced BERT with disentangled attention</article-title><source>arXiv</source><comment>Preprint posted online on 2021</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2006.03654">http://arxiv.org/abs/2006.03654</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>MoritzLaurer/deberta-v3-base-mnli-fever-anli</article-title><source>HuggingFace</source><year>2023</year><access-date>2025-11-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli">https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Bart-large model page</article-title><source>HuggingFace</source><year>2024</year><access-date>2025-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/facebook/bart-large-mnli">https://huggingface.co/facebook/bart-large-mnli</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</article-title><source>arXiv</source><comment>Preprint posted online on 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.13461</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><person-group 
person-group-type="author"><collab>AI@Meta</collab></person-group><article-title>Llama 3 model card</article-title><source>github</source><year>2024</year><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md">https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chadha</surname><given-names>A</given-names> </name></person-group><article-title>A systematic survey of prompt engineering in large language models: techniques and applications</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.07927</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>B&#x00FC;rgisser</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chalot</surname><given-names>E</given-names> </name><name name-style="western"><surname>Mehouachi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Large language models for accurate disease detection in electronic health records: the examples of crystal arthropathies</article-title><source>RMD Open</source><year>2024</year><month>12</month><volume>10</volume><issue>4</issue><fpage>e005003</fpage><pub-id 
pub-id-type="doi">10.1136/rmdopen-2024-005003</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Gitlab repository</collab></person-group><source>GitLab</source><year>2024</year><access-date>2025-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gitlab-b.unige.ch/trial_integrity/ai_intervention_categorisation_public">https://gitlab-b.unige.ch/trial_integrity/ai_intervention_categorisation_public</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilson</surname><given-names>EB</given-names> </name></person-group><article-title>Probable Inference, the Law of Succession, and Statistical Inference</article-title><source>J Am Stat Assoc</source><year>1927</year><month>06</month><volume>22</volume><issue>158</issue><fpage>209</fpage><lpage>212</lpage><pub-id pub-id-type="doi">10.1080/01621459.1927.10502953</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>R Core Team</collab></person-group><source>R: A Language and Environment for Statistical Computing</source><year>2019</year><publisher-name>R Foundation for Statistical Computing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.R-project.org">https://www.R-project.org</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Windisch</surname><given-names>P</given-names> </name><name name-style="western"><surname>Dennst&#x00E4;dt</surname><given-names>F</given-names> </name><name name-style="western"><surname>Koechli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Metastatic 
versus localized disease as inclusion criteria that can be automatically extracted from randomized controlled trials using natural language processing</article-title><source>JCO Clin Cancer Inform</source><year>2024</year><month>12</month><issue>8</issue><pub-id pub-id-type="doi">10.1200/CCI-24-00150</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Syriani</surname><given-names>E</given-names> </name><name name-style="western"><surname>David</surname><given-names>I</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>G</given-names> </name></person-group><article-title>Screening articles for systematic reviews with ChatGPT</article-title><source>Journal of Computer Languages</source><year>2024</year><month>08</month><volume>80</volume><fpage>101287</fpage><pub-id pub-id-type="doi">10.1016/j.cola.2024.101287</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Courvoisier</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Buitrago-Garcia</surname><given-names>D</given-names> </name><name name-style="western"><surname>Buclin</surname><given-names>CP</given-names> </name><name name-style="western"><surname>B&#x00FC;rgisser</surname><given-names>N</given-names> </name><name name-style="western"><surname>Iudici</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mongin</surname><given-names>D</given-names> </name></person-group><article-title>Beyond human gold standards: A multimodel framework for automated abstract classification and information extraction</article-title><source>Res Synth Methods</source><year>2026</year><month>03</month><volume>17</volume><issue>2</issue><fpage>365</fpage><lpage>377</lpage><pub-id 
pub-id-type="doi">10.1017/rsm.2025.10054</pub-id><pub-id pub-id-type="medline">41635942</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qureshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shaughnessy</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gill</surname><given-names>KAR</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><name name-style="western"><surname>Agai</surname><given-names>E</given-names> </name></person-group><article-title>Are ChatGPT and large language models &#x201C;the answer&#x201D; to bringing us closer to systematic review automation?</article-title><source>Syst Rev</source><year>2023</year><month>04</month><day>29</day><volume>12</volume><issue>1</issue><fpage>72</fpage><pub-id pub-id-type="doi">10.1186/s13643-023-02243-z</pub-id><pub-id pub-id-type="medline">37120563</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nayfeh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tetzlaff</surname><given-names>J</given-names> </name><name name-style="western"><surname>O&#x2019;Blenis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Murad</surname><given-names>MH</given-names> </name></person-group><article-title>Error rates of human reviewers during abstract screening in systematic reviews</article-title><source>PLoS 
ONE</source><year>2020</year><month>01</month><day>14</day><volume>15</volume><issue>1</issue><fpage>e0227742</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0227742</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials.</p><media xlink:href="medinform_v14i1e77943_app1.docx" xlink:title="DOCX File, 65 KB"/></supplementary-material></app-group></back></article>