<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e72657</article-id><article-id pub-id-type="doi">10.2196/72657</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Classification of Cochrane Plain Language Summaries by Conclusiveness Using Transformer-Based Models and ChatGPT: Retrospective Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Mijatovi&#x0107;</surname><given-names>Antonija</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ursi&#x0107;</surname><given-names>Luka</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Brali&#x0107;</surname><given-names>Nensi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Bandi&#x0107;</surname><given-names>Ru&#x017E;ica</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>&#x0106;a&#x0107;i&#x0107;</surname><given-names>Barbara</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Buljan</surname><given-names>Ivan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maru&#x0161;i&#x0107;</surname><given-names>Ana</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Research in Biomedicine and Health, Centre for Evidence-based Medicine, University of Split School of Medicine</institution><addr-line>&#x0160;oltanska 2A</addr-line><addr-line>Split</addr-line><country>Croatia</country></aff><aff id="aff2"><institution>Faculty of Humanities and Social Sciences, University of Split</institution><addr-line>Split</addr-line><country>Croatia</country></aff><aff id="aff3"><institution>Department of Psychology, Faculty of Humanities and Social Sciences, University of Split</institution><addr-line>Split</addr-line><country>Croatia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gaol</surname><given-names>Ford Lumban</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Homayouni</surname><given-names>Ramin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Antonija Mijatovi&#x0107;, PhD, Department of Research in Biomedicine and Health, Centre for Evidence-based Medicine, University of Split School of Medicine, &#x0160;oltanska 2A, Split, 21000, Croatia, 385 21557820; <email>antonija.mijatovic@mefst.hr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>14</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e72657</elocation-id><history><date date-type="received"><day>14</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>23</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Antonija Mijatovi&#x0107;, Luka Ursi&#x0107;, Nensi Brali&#x0107;, Ru&#x017E;ica Bandi&#x0107;, Barbara &#x0106;a&#x0107;i&#x0107;, Ivan Buljan, Ana Maru&#x0161;i&#x0107;. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 14.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e72657"/><abstract><sec><title>Background</title><p>Cochrane plain language summaries (PLSs) aim to make systematic review findings more accessible to the general public. However, inconsistencies in how conclusions are presented may impact comprehension and decision-making. Classifying PLSs based on conclusiveness can improve clarity and facilitate informed health decisions.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate deep learning language models for the classification of PLSs according to 3 levels of conclusiveness (conclusive, inconclusive, and unclear) and to compare their performance with a general-purpose large language model (GPT-4o).</p></sec><sec sec-type="methods"><title>Methods</title><p>We used a publicly available dataset containing 4405 Cochrane PLSs of systematic reviews published until 2019, already classified by humans according to 9 categories of conclusiveness regarding the intervention&#x2019;s effectiveness or safety. We merged these categories into 3 classes based on the strength of conclusiveness: conclusive, inconclusive, and unclear. For the fine-tuning, we used Scientific Bidirectional Encoder Representations from Transformers (SciBERT), a pretrained language model trained on 1.14 million papers primarily from the health sciences, and Longformer, a transformer model designed specifically to process long documents. The script was developed using the Python programming language and the PyTorch framework. 
We computed evaluation metrics using the <italic>scikit-learn</italic> machine learning library and determined the area under the curve of the receiver operating characteristic (AUCROC) to measure the model performance in balancing sensitivity and specificity. We also analyzed a separate set of 213 PLSs and compared the predictions of our pretrained models with both manual verification and outputs generated by ChatGPT.</p></sec><sec sec-type="results"><title>Results</title><p>The model based on SciBERT achieved a balanced accuracy of 56.6%. The AUCROC was 0.91 for &#x201C;conclusive,&#x201D; 0.67 for &#x201C;inconclusive,&#x201D; and 0.75 for &#x201C;unclear&#x201D; conclusiveness classes. The Longformer-based model had a balanced accuracy of 60.9%, with AUCROCs of 0.86 for &#x201C;conclusive,&#x201D; 0.67 for &#x201C;inconclusive,&#x201D; and 0.72 for &#x201C;unclear&#x201D; conclusiveness classes. Both models underperformed compared with ChatGPT, which demonstrated higher accuracy (74.2%), better precision and recall, and a higher Cohen &#x03BA; (0.57).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Fine-tuning 2 transformer-based language models showed mixed results in classifying Cochrane PLSs by conclusiveness, likely due to semantic overlap and subtle linguistic differences. Despite satisfactory internal test metrics, the fine-tuned models failed to generalize to newly published PLSs, where performance dropped to near-chance levels. 
These findings suggest that general-purpose large language models like GPT-4o may currently offer more reliable results for practical classification tasks in biomedical applications.</p></sec></abstract><kwd-group><kwd>plain language summary</kwd><kwd>PLS</kwd><kwd>large language models</kwd><kwd>Scientific Bidirectional Encoder Representations from Transformers</kwd><kwd>SciBERT</kwd><kwd>Longformer</kwd><kwd>fine-tuning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>A Cochrane plain language summary (PLS) is a stand-alone summary of a Cochrane systematic review used to disseminate health-related evidence to a wider audience with the goal of facilitating evidence-based decision-making about health care, particularly for medical treatments [<xref ref-type="bibr" rid="ref1">1</xref>]. A well-written PLS should be comprehensible to readers without a background in research or health care, including patients, caregivers, and policymakers [<xref ref-type="bibr" rid="ref2">2</xref>], and should be presented at or below a sixth-grade reading level to ensure accessibility for all readers [<xref ref-type="bibr" rid="ref3">3</xref>]. It should also allow readers to comprehend the certainty of evidence and to correctly interpret the results, which is why the authors should not offer specific recommendations but rather present their findings clearly and guide the readers toward independent conclusions [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, a PLS necessarily has a conclusion section conveying the main message, where the level of certainty of the evidence is presented using narrative statements [<xref ref-type="bibr" rid="ref2">2</xref>]. 
For example, &#x201C;Intervention causes a large reduction/increase in outcome&#x201D; is a suggested narrative for large effect size and high certainty of the evidence, whereas &#x201C;It is unclear if intervention has an effect on outcome&#x201D; should indicate very low certainty [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Conclusiveness is an important concept in research and health care, indicating a degree of confidence in the findings and facilitating decision-making [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], while also ensuring that the current evidence is easily understood [<xref ref-type="bibr" rid="ref10">10</xref>]. When provided with conclusive health information, patients rely less on health care professionals to decide on diagnosis and therapy [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Unfortunately, they do not always succeed in finding relevant information for their health condition, as shown in a study where a quarter of respondents did not find answers to the health-related inquiries they raised on the internet [<xref ref-type="bibr" rid="ref13">13</xref>]. Patients also often find information in unreliable sources, providing misleading or false data on diagnostics and treatment [<xref ref-type="bibr" rid="ref14">14</xref>]. In addition, numerous studies have reported higher levels of anxiety and cyberchondria with increases in online health information searches [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. 
Given that Cochrane Reviews represent the best available knowledge in the field [<xref ref-type="bibr" rid="ref1">1</xref>], the accurate classification of their PLSs could significantly improve patients&#x2019; comprehension of these conclusions and allow them to make well-informed decisions about health care interventions.</p><p>Several studies and reviews on the conclusiveness of Cochrane PLSs found that 50% to 80% of the reviews enabled readers to reach a relevant conclusion, while the readability of PLSs was generally poor, with conclusions often unclear or missing [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. However, in some cases, conclusive statements were made even when the quality of evidence was low or moderate [<xref ref-type="bibr" rid="ref23">23</xref>]. Our previous research showed that most PLSs lacked or had unclear conclusions regarding an intervention&#x2019;s efficacy and safety [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>In all of the studies on the conclusiveness of systematic reviews, the process of classifying the reviews and PLSs was carried out manually, usually by at least 2 independent assessors, which is a demanding and time-consuming task. In this study, we explored whether the classification of PLSs according to their level of conclusiveness could be conducted with the help of artificial intelligence (AI) and natural language processing (NLP). NLP algorithms, particularly deep learning models like neural networks, can automatically learn and extract patterns from language data [<xref ref-type="bibr" rid="ref25">25</xref>], enabling them to understand context and semantics [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. 
Their multilingual variants continue to expand and become accessible to speakers of less-represented languages [<xref ref-type="bibr" rid="ref28">28</xref>] while domain-specific models lead to greater accuracy [<xref ref-type="bibr" rid="ref29">29</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>In this retrospective observational study with a supervised machine learning approach, we used a pretrained deep large language model (LLM) for PLS classification according to 3 levels of conclusiveness: conclusive, inconclusive, and unclear. We fine-tuned 2 transformer-based models&#x2014;Scientific Bidirectional Encoder Representations from Transformers (SciBERT) and Longformer&#x2014;for our task. SciBERT is a variant of the Bidirectional Encoder Representations from Transformers (BERT) model specifically designed for scientific and biomedical text processing that is pretrained on a vast corpus of scientific literature consisting of 18% of papers from computer science and 82% from the biomedical field [<xref ref-type="bibr" rid="ref30">30</xref>]. Longformer is a transformer architecture optimized for processing long documents through sparse attention mechanisms [<xref ref-type="bibr" rid="ref31">31</xref>]. SciBERT was selected to leverage the domain-specific language of PLSs, whereas Longformer was selected to accommodate PLSs with extended length. Specifically, the median number of words in a PLS is 345 [<xref ref-type="bibr" rid="ref32">32</xref>], which corresponds to approximately 500 tokens, just under SciBERT&#x2019;s 512-token limit. 
However, the Longformer model&#x2019;s extended token capacity of up to 4096 tokens allows for the processing of all PLSs without truncation.</p></sec><sec id="s2-2"><title>Data Source</title><p>We used the dataset from our previous study [<xref ref-type="bibr" rid="ref24">24</xref>], which contains 4405 Cochrane PLSs of systematic reviews on intervention studies published until 2019, already classified by 2 independent experts into 9 categories based on the conclusiveness regarding an intervention&#x2019;s effectiveness or safety. We combined these categories into 3 distinct classes: conclusive, inconclusive, and unclear, allowing for a more manageable and interpretable classification task (<xref ref-type="other" rid="box1">Textbox 1</xref>).</p><boxed-text id="box1"><title> Classification of conclusiveness categories.</title><p><bold>Conclusive</bold></p><list list-type="bullet"><list-item><p>Positive: signifies the existence of moderate- or high-quality evidence supporting the effectiveness or safety</p></list-item><list-item><p>Negative: indicates the presence of moderate- or high-quality evidence of intervention&#x2019;s ineffectiveness or harm</p></list-item><list-item><p>Equal: denotes that the analyzed interventions were of equal effectiveness or safety</p></list-item></list><p><bold>Inconclusive</bold></p><list list-type="bullet"><list-item><p>Positive inconclusive: implies the existence of evidence supporting effectiveness or safety, yet the evidence is low quality or inconclusive. The authors suggest that more research is needed.</p></list-item><list-item><p>Negative inconclusive: suggests there is evidence of ineffectiveness or harm (indicating that the observed effect or the intervention was unsafe), yet the evidence is low quality or inconclusive. 
Authors may advise against the intervention or comparison and state that more research is required.</p></list-item><list-item><p>Equal inconclusive: indicates that the interventions exhibit comparable levels of effectiveness or safety, yet the evidence is low quality or inconclusive. The authors emphasize that more research is required.</p></list-item></list><p><bold>Unclear</bold></p><list list-type="bullet"><list-item><p>No opinion: the authors provided no opinion.</p></list-item><list-item><p>No evidence: there is no evidence from randomized controlled trials because the literature search did not result in any eligible studies (ie, empty reviews).</p></list-item><list-item><p>Unclear: the authors did not present clear conclusions.</p></list-item></list></boxed-text><p>With the classification, the evidence in the &#x201C;conclusive&#x201D; class is strong and clear, irrespective of the direction of the effect, as opposed to the &#x201C;inconclusive&#x201D; class, where it is uncertain or of lower quality. In the &#x201C;unclear&#x201D; class, conclusions are absent, either because authors have not provided a clear opinion or due to a lack of available evidence. This lack of conclusiveness is not indicative of a poorly written PLS, as long as the PLS accurately represents the findings from the systematic review. This is why we must differentiate between PLSs that conclude that there is &#x201C;no evidence&#x201D; and those that offer no opinion or present unclear conclusions [<xref ref-type="bibr" rid="ref24">24</xref>].</p></sec><sec id="s2-3"><title>Language Processing Models</title><p>LLMs are important components of NLP designed to understand and generate human language. LLMs such as GPT-3 and BERT are pretrained on massive datasets containing text from the internet [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. 
Most importantly, they are highly adaptable, meaning they can be fine-tuned for specific tasks [<xref ref-type="bibr" rid="ref35">35</xref>]. For example, Beltagy et al [<xref ref-type="bibr" rid="ref30">30</xref>] fine-tuned BERT, an LLM that had been pretrained on a wide range of text on the internet [<xref ref-type="bibr" rid="ref33">33</xref>], to develop SciBERT, an LLM trained on a vast corpus of scientific literature, primarily from biomedical and life sciences, making it suitable for NLP tasks in the scientific and medical research domains. Similarly, Longformer was developed to address the limitations of handling long documents; it uses a sparse attention mechanism, where each token focuses on a limited local context rather than the entire sentence [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>We achieved transfer learning by further fine-tuning SciBERT and Longformer on our specific PLS classification task. In transfer learning, the LLM adapts its learned features to the nuances of the new task while retaining the knowledge it acquired during pretraining [<xref ref-type="bibr" rid="ref36">36</xref>]. This approach is intended to mirror how humans learn, as we often apply knowledge and skills acquired in one context to solve new, related problems [<xref ref-type="bibr" rid="ref37">37</xref>]. 
Transfer learning not only speeds up the training process but also leads to better performance compared with training from scratch [<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s2-4"><title>Experimental Setup and Fine-Tuning</title><p>We wrote the script using the Python programming language (version 3.12.3; Python Software Foundation) with the help of the PyTorch framework [<xref ref-type="bibr" rid="ref39">39</xref>] and executed it within the Jupyter Notebook environment [<xref ref-type="bibr" rid="ref40">40</xref>] using the NVIDIA GeForce RTX 3080 GPU (version 8200).</p><p>Both of our models came from the Hugging Face library [<xref ref-type="bibr" rid="ref41">41</xref>]. For SciBERT, we used its associated tokenizer, setting the maximum token length to 512. For Longformer, we extended the maximum token length to 2048 to accommodate the full content of the PLSs without truncation. Both models included a dropout layer with a rate of 0.5, a regularization technique that reduces the risk of overfitting.</p><p>For both models, we used a 768-dimensional pooled embedding vector as input to our classifier. This representation was passed through a dropout layer and a linear layer that produced a 3D output corresponding to our target classes. SciBERT used its built-in pooled ([CLS] token) representation, whereas Longformer used mean pooling across all token embeddings due to the absence of a pooler layer. A standard attention mask was applied during encoding, ensuring that padding tokens were fully excluded from self-attention computations. We used AdamW as the optimizer to update network weights [<xref ref-type="bibr" rid="ref42">42</xref>] and categorical cross entropy as our loss function [<xref ref-type="bibr" rid="ref43">43</xref>]. We set the maximum number of training epochs (where 1 epoch represents a complete pass through the training dataset) to 15 for SciBERT and 10 for Longformer, with early stopping based on validation loss. 
In practice, SciBERT training stopped after 7 epochs due to early stopping. The learning rate was set to 2&#x00D7;10<sup>&#x2212;5</sup> for both the models. Since these hyperparameters cannot be determined a priori, they were selected via trial and error [<xref ref-type="bibr" rid="ref44">44</xref>]. For this reason, we monitored training and validation performance and used early stopping based on validation loss to prevent overfitting. The best-performing models (lowest validation loss) were saved and later used for evaluation.</p><p>Baseline models (SciBERT and Longformer) were implemented using a frozen feature-extraction transfer-learning approach. All pretrained transformer encoder weights were frozen, and only the newly added linear classification layer was trainable. The encoder itself did not undergo any gradient updates. Therefore, this baseline represents a lightweight transfer-learning model.</p></sec><sec id="s2-5"><title>Data Splitting and Handling of Class Imbalance</title><p>We divided the dataset into training (80%), testing (10%), and validation (10%) subsets. Random undersampling was applied only to the training set, where the smallest class contained 343 PLSs. All classes were downsampled to this size to create a balanced training set.</p></sec><sec id="s2-6"><title>Model Validation</title><p>To assess performance, we used functions from <italic>scikit-learn</italic> [<xref ref-type="bibr" rid="ref45">45</xref>], including <italic>balanced_accuracy_score</italic>, which calculates balanced accuracy for addressing imbalances in multiclass datasets, and <italic>precision_recall_fscore_support</italic>, which provides precision, recall, and F-beta scores for each class. Precision is the proportion of true positives relative to the total of true positives and false positives. Recall is the proportion of true positives relative to the total of true positives and false negatives. 
The <italic>F</italic><sub>1</sub>-score is the harmonic mean of the precision and recall. Additionally, we evaluated the model&#x2019;s ability to balance sensitivity and specificity by measuring the area under the curve of the receiver operating characteristic (AUCROC), which is the proportion of area below the receiver operating characteristic curve, which in turn is the plot of the true positive rate against the false positive rate.</p></sec><sec id="s2-7"><title>Effect of Training and Validation Split on Model Performance</title><p>To evaluate the impact of different training and validation splits on model performance, we evaluated SciBERT&#x2019;s performance with 10%, 20%, and 30% of the data reserved for validation. This analysis was conducted to assess the relationship between training set size and classification accuracy, based on the assumption that larger training sets may improve model learning. By progressively reducing the number of training samples, we tested the extent to which performance would degrade with less training data.</p></sec><sec id="s2-8"><title>Manual Validation and Comparison With GPT-4o Performance</title><p>To evaluate model performance, we used a separate verification dataset consisting of 213 Cochrane PLSs published between September 2024 and May 2025, which was not a part of the original training or evaluation datasets. Each PLS was independently classified by 2 domain experts, with a third expert resolving any discrepancies (NB, a metascientist with expertise in Cochrane PLSs and conclusiveness assessment; RB, a research assistant at the Department of Research in Biomedicine and Health; and B&#x0106;, a psychology student with experience using AI tools to assess compliance with reporting guidelines).</p><p>The GPT-4o model was prompted using a zero-shot classification approach, which means no example classifications were provided in the prompt. 
The prompt instructed the model to classify each PLS into one of 3 predefined categories and included detailed definitions for each class, identical to those used by human annotators. The complete prompt can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The performance of the 2 trained BERT-based models&#x2014;SciBERT and Longformer&#x2014;was compared with the baseline GPT-4o model (subscription-based) and evaluated against labels assigned by human experts. Model outputs were compared using standard classification metrics: accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, and Cohen &#x03BA;, with the lattermost being used to assess the level of agreement between model predictions and the expert consensus categorization.</p></sec><sec id="s2-9"><title>Calibration Analysis</title><p>To evaluate the reliability of predicted probabilities, we conducted a calibration analysis on the fine-tuned SciBERT model. Predicted probabilities were compared with observed outcome frequencies across the 3 target classes using calibration plots and quantitative metrics, including expected calibration error and Brier score. Since Longformer achieved similar classification performance, calibration was performed only for SciBERT.</p></sec><sec id="s2-10"><title>Ethical Considerations</title><p>This study did not involve human participants or the collection of private data. A publicly available dataset of Cochrane PLSs was used, which can be accessed via the Open Science Framework [<xref ref-type="bibr" rid="ref46">46</xref>].</p><p>The use of publicly available data is exempt from ethics review in accordance with the University of Split School of Medicine guidelines and the Croatian Science Foundation project Professionalism in Health &#x2013; Decision making in practice and research (IP-2019-04-4882) [<xref ref-type="bibr" rid="ref47">47</xref>]. 
Therefore, institutional review board approval and informed consent were not required. The dataset used in this study was collected and shared under conditions that permit secondary analysis without additional consent requirements. The data contain no personally identifiable information. All analyzed PLSs are publicly accessible textual documents.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Among the 4405 PLSs from our dataset, 429 (9.7%) had been manually categorized as conclusive, 1203 (27.3%) as inconclusive, and 2773 (63%) as unclear [<xref ref-type="bibr" rid="ref24">24</xref>]. These PLSs served as input data for our model. To address class imbalance, we applied random undersampling to the training set by selecting an equal number of PLSs from each class (n=343). This ensured that we had a balanced dataset and reduced the risk of biased model learning.</p><p>In classifying the PLSs, the SciBERT model achieved a balanced accuracy of 56.6%, with AUCROC scores of 0.91 for the conclusive, 0.67 for the inconclusive, and 0.75 for the unclear class. The Longformer model, meanwhile, demonstrated a balanced accuracy of 60.9%, with AUCROC scores of 0.86, 0.67, and 0.72 for the same classes, respectively. The receiver operating characteristic curves and confusion matrices are visualized in <xref ref-type="fig" rid="figure1">Figure 1</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref>, while <xref ref-type="table" rid="table1">Table 1</xref> presents a side-by-side comparison of the performance for both models across all classes.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Receiver operating characteristic (ROC) curves and the corresponding area under the curve of the receiver operating characteristic scores for each class (0=conclusive, 1=inconclusive, and 2=unclear); (A) SciBERT model and (B) Longformer model. Calculated and visualized using scikit-learn. 
AUC: area under the curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e72657_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Confusion matrix for the fine-tuned (A) SciBERT and (B) Longformer multiclass classification (0=conclusive, 1=inconclusive, and 2= unclear) used to visualize the performance of the conclusiveness classification algorithm (created using the scikit-learn). The confusion matrix evaluates multiclass performance by comparing the predicted classes with the actual classes. The diagonal elements represent the correct predictions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e72657_fig02.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Per-class performance of the conclusiveness classification for Scientific Bidirectional Encoder Representations from Transformers (SciBERT) and Longformer models<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUCROC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Conclusive</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SciBERT</td><td align="left" valign="top">0.43</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Longformer</td><td align="left" valign="top">0.43</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.87</td></tr><tr><td align="left" valign="top" colspan="5">Inconclusive</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SciBERT</td><td align="left" valign="top">0.42</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.38</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Longformer</td><td align="left" valign="top">0.46</td><td align="left" valign="top">0.31</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top" colspan="5">Unclear</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SciBERT</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Longformer</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Precision is the proportion of true positives relative to the total of true positives and false positives. Recall is the proportion of true positives relative to the total of true positives and false negatives. <italic>F</italic><sub>1</sub>-score is the harmonic mean of the precision and recall. 
Area under the curve of the receiver operating characteristic is the proportion of area below the receiver operating characteristic curve, which is the plot of the true positive rate against the false positive rate. The scores were obtained using the scikit-learn functions.</p></fn><fn id="table1fn2"><p><sup>b</sup>AUCROC: area under the curve of the receiver operating characteristic.</p></fn></table-wrap-foot></table-wrap><p>For comparison, the baseline SciBERT model achieved a balanced accuracy of only 42.2%, with AUCROC scores of 0.68 for the conclusive, 0.53 for the inconclusive, and 0.53 for the unclear class. The baseline Longformer&#x2019;s balanced accuracy was 39.0%, with AUCROC scores of 0.69, 0.56, and 0.54 for the same classes, respectively. However, the baseline Longformer was unstable across repeated runs and sometimes predicted only one class. This indicates that the frozen encoder was not able to provide features that separate the 3 categories well. The performance of all models is documented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-2"><title>Effect of the Training Set Size and Validation Split on Model Performance</title><p>The highest performance was observed with a 10% validation split, whereas 20% and 30% validation splits resulted in similarly reduced accuracies (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance of Scientific Bidirectional Encoder Representations from Transformers with varying training and validation splits.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Validation split</td><td align="left" valign="bottom">Training samples (per class), n</td><td align="left" valign="bottom">Balanced accuracy (%)</td></tr></thead><tbody><tr><td align="left" valign="top">10%</td><td align="left" valign="top">343</td><td align="left" 
valign="top">56.6</td></tr><tr><td align="left" valign="top">20%</td><td align="left" valign="top">257</td><td align="left" valign="top">53.1</td></tr><tr><td align="left" valign="top">30%</td><td align="left" valign="top">172</td><td align="left" valign="top">53.3</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Manual Validation and Comparison With ChatGPT Performance</title><p>Among the 213 PLSs from our additional verification dataset, 44 (20.7%) had been manually categorized as conclusive, 110 (51.6%) as inconclusive, and 59 (27.7%) as unclear. The Cohen &#x03BA; value between the experts was 0.57, indicating moderate agreement. The baseline GPT-4o model outperformed the trained BERT-based models (<xref ref-type="table" rid="table3">Table 3</xref>). SciBERT had the poorest performance overall, while ChatGPT had the strongest, with most of its conclusive predictions being correct. ChatGPT also achieved moderate agreement with the human classifications (Cohen &#x03BA;=0.57). This indicates that ChatGPT&#x2019;s classifications were as consistent with the expert consensus as the experts&#x2019; classifications were with one another.
In contrast, SciBERT and Longformer demonstrated poor alignment with the reference classifications, with Cohen &#x03BA; values of 0.03 and 0.19, respectively, indicating only slight agreement beyond chance (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparative performance of fine-tuned Scientific Bidirectional Encoder Representations from Transformers (SciBERT) and Longformer models and baseline GPT-4o model on conclusiveness classification task.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Fine-tuned SciBERT</td><td align="left" valign="bottom">Fine-tuned Longformer</td><td align="left" valign="bottom">Baseline GPT-4o</td></tr></thead><tbody><tr><td align="left" valign="top">Precision</td><td align="left" valign="top">0.34</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.74</td></tr><tr><td align="left" valign="top">Recall</td><td align="left" valign="top">0.34</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.74</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.27</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.74</td></tr><tr><td align="left" valign="top">Accuracy (%)</td><td align="left" valign="top">34.3</td><td align="left" valign="top">44.1</td><td align="left" valign="top">74.2</td></tr><tr><td align="left" valign="top">Cohen &#x03BA;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.57</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Predictions were compared against expert annotations, where each plain language summary was manually classified by 2 experts, with a third expert
resolving any discrepancies.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Calibration Analysis</title><p>The calibration analysis was performed on the fine-tuned SciBERT model, which was overconfident for the &#x201C;conclusive&#x201D; class, poorly calibrated for the &#x201C;inconclusive&#x201D; class, and initially underconfident for the &#x201C;unclear&#x201D; class, although highly accurate when assigning high probabilities. Calibration plots and quantitative metrics, including expected calibration error and Brier scores (a measure of the average squared difference between predicted probability and actual outcome), are presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>Our results showed that transformer-based language models such as SciBERT and Longformer achieved modest performance in classifying Cochrane PLSs based on their level of conclusiveness. Both models were fine-tuned on a balanced dataset and evaluated using standard classification metrics, with Longformer achieving a balanced accuracy of 60.9%, compared with 56.6% for SciBERT. Both models performed best on the conclusive class, achieving relatively high AUCROC and <italic>F</italic><sub>1</sub>-scores. For the unclear class, SciBERT demonstrated stronger precision and recall. However, both models struggled to distinguish the inconclusive class, with the lowest <italic>F</italic><sub>1</sub>-scores and overlapping errors with the unclear category. Both models underperformed on the inconclusive class, with poor AUCROC and low precision and recall scores. These findings may indicate that conclusiveness is expressed in linguistically nuanced ways that the models are unable to detect and that there could be a semantic overlap between inconclusive and unclear PLSs. 
Both models were outperformed by ChatGPT, which achieved better accuracy and interrater agreement. This suggests that general-purpose LLMs may offer more reliable performance for this classification task, even without domain-specific fine-tuning. Notably, GPT-4o achieved the same Cohen &#x03BA; value (0.57) as the agreement between human experts, suggesting that it mirrors expert-level judgment and nuanced reasoning. This finding highlights the potential of general-purpose LLMs to approximate human evaluation in semantically complex classification tasks.</p></sec><sec id="s4-2"><title>Qualitative Insights From Manual Classification</title><p>During manual labeling of PLSs, we identified several challenges that may explain why the experts, the fine-tuned models, and ChatGPT all struggled with the classifications. First, there was some ambiguity between the &#x201C;inconclusive&#x201D; and &#x201C;unclear&#x201D; classes. For example, some PLSs did not clearly state whether the evidence was insufficient, which might be why both human assessors and models were uncertain when assigning these labels. Furthermore, the interpretation of the criteria for the &#x201C;conclusive&#x201D; class was occasionally ambiguous, particularly in cases where PLSs included recommendations but lacked clear statements about intervention effectiveness. This ambiguity likely made it difficult for human annotators to determine whether the conclusion was strong enough to classify a PLS as conclusive or inconclusive. Consequently, models trained on these labels may have inherited this ambiguity. We also observed that PLSs included expressions such as &#x201C;may help&#x201D; or &#x201C;probably works,&#x201D; which are common in scientific writing but can signal uncertainty. This nuance might have been difficult for models to detect, explaining their lower performance in differentiating between inconclusive and unclear statements. 
These findings suggest that better model instruction, such as through advanced prompt engineering, might help improve future performance of the GPT-4o model. They also highlight the need to incorporate linguistic features of uncertainty more explicitly into the training process.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>To our knowledge, there have been no studies on the automatic classification of Cochrane PLSs or full reviews based on the level of conclusiveness, although some studies examined machine learning techniques for making systematic review processes more efficient. For example, one study developed a randomized controlled trial classifier for Cochrane Reviews, a tool that discerns whether a selected study qualifies as a randomized controlled trial [<xref ref-type="bibr" rid="ref48">48</xref>]. In another, ChatGPT showed strong performance when used for abstractive summarization of longer texts, including news articles and public speeches [<xref ref-type="bibr" rid="ref49">49</xref>]. However, given the lack of specialized expertise in the field of medicine, ChatGPT does not always grasp the nuances of its terminology and sometimes struggles to recognize important information [<xref ref-type="bibr" rid="ref50">50</xref>]. In addition, one study found that LLMs sometimes generate factually inconsistent summaries, which could potentially harm readers [<xref ref-type="bibr" rid="ref51">51</xref>]. Yet, these challenges and the related legal and ethical issues should not discourage the use of LLMs but rather encourage further research and refinement of the technology.</p><p>The fine-tuned BERT-based models did not perform well in our classification task, indicating limitations in generalizing to nuanced language in PLSs. In contrast, general-purpose language models can perform better than fine-tuned models in some classification tasks, achieving Cohen &#x03BA; scores comparable to those of human experts. 
This is likely because they have been trained on much larger and more diverse text corpora and have more complex architectures, allowing them to better understand context and differentiate linguistic nuances [<xref ref-type="bibr" rid="ref52">52</xref>]. This is also in line with findings by Davidson and Chae [<xref ref-type="bibr" rid="ref53">53</xref>] that LLMs, particularly when fine-tuned on prompts that include explicit instructions, can outperform traditional supervised models in a variety of classification settings without task-specific training.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations. First, although we began with a relatively large dataset of 4405 PLSs, the &#x201C;conclusive&#x201D; class comprised only about 9.73% (n=429) of the total dataset. To address this issue, we applied random undersampling, which reduced the number of PLSs in the &#x201C;inconclusive&#x201D; and &#x201C;unclear&#x201D; classes. While this approach ensured balanced class representation, it also removed a substantial amount of data (2344/2773, 84.53% of the &#x201C;unclear&#x201D; class) that could have supported more robust model learning. This likely limited the models&#x2019; ability to learn the linguistic variability of the majority class. In contrast, GPT-4o was used in zero-shot inference settings and was not trained on our dataset, meaning that its performance could not be affected by the undersampling procedure that constrained the fine-tuned models. Alternative approaches, such as applying class-weighted loss functions or oversampling minority classes, may yield improved performance in future work.</p><p>Second, the dataset was based exclusively on PLSs of Cochrane Reviews, representing a single domain within evidence-based health literature. 
This may also limit the model&#x2019;s generalizability to other types of health communication.</p><p>Third, there were often very subtle linguistic differences between the inconclusive and unclear PLSs, which introduced noise in model classification. Some PLSs lacked clear phrasing or used expressions such as &#x201C;may help&#x201D; and &#x201C;probably works&#x201D; that were difficult to interpret consistently, even for human annotators. This ambiguity likely contributed to the models&#x2019; difficulty in separating these two classes.</p><p>Fourth, although one researcher (IB) participated in and provided instruction for both the original 2019 annotation and the current one, the full annotator teams differed between the two studies. It is possible that there were subtle differences in how annotators interpreted or applied the criteria, raising the possibility of annotator drift between the original labels used for model training and the new labels used for verification. Such drift may partly account for the observed decline in performance of the fine-tuned models on the newer dataset.</p><p>Fifth, when comparing accuracy across different splits, it is important to note that altering the training and validation proportion also changes the size of the remaining test set. Because of this, the test benchmarks were not identical across these experiments. While this does not affect the qualitative pattern we observed, the varying test baseline may contribute to numerical differences in accuracy.</p><p>Additionally, the fine-tuned SciBERT model showed high volatility in validation loss across training epochs, including abrupt spikes prior to early stopping. This suggests that the model may not have reached a fully stable convergence point, possibly due to the limited size of the training set and the complexity of the task.
Such fluctuations may have constrained the model&#x2019;s performance.</p><p>Finally, although we compared our fine-tuned models with GPT-4o, we did not use advanced prompt engineering or task-specific tuning. This likely underestimated ChatGPT&#x2019;s performance on this classification task.</p></sec><sec id="s4-5"><title>Future Work and Recommendations</title><p>Future studies should expand the dataset to include PLSs from non-Cochrane sources and from different health domains, which could improve model generalizability. First, although there was no meaningful difference in the performance of different training and validation splits, future work should also explore the impact of larger datasets on model performance. Second, adding task-specific fine-tuning and advanced prompt engineering of LLMs such as GPT-4o could improve classification accuracy even more. Third, models may be able to more successfully differentiate between inconclusive and unclear classes by incorporating linguistic features that capture uncertainty and conclusiveness cues.</p><p>In the long term, implementing a general-purpose LLM, such as ChatGPT, within Cochrane platforms (eg, the RevMan Web dashboard) could assist authors in ensuring that their conclusions are clear and guide readers through reviews according to their conclusiveness level. Additionally, LLMs could support users of the Cochrane Library by offering filters or tags that group PLSs by conclusiveness. However, additional model validation and ethical review should precede these real-world applications.</p></sec><sec id="s4-6"><title>Conclusions</title><p>We explored the use of 2 fine-tuned transformer-based models&#x2014;SciBERT and Longformer&#x2014;for classifying Cochrane PLSs according to their level of conclusiveness.
Both models demonstrated modest internal performance but poor generalization to newly published PLSs, particularly in distinguishing between inconclusive and unclear categories, likely due to their semantic overlap. An empirical analysis of different training and validation splits confirmed that larger training sets improve model performance, although the gains were modest. Most importantly, both models were outperformed by ChatGPT, which, as a general-purpose language model, achieved higher accuracy (74%) and agreement with expert annotations, suggesting that state-of-the-art LLMs hold greater potential for health care information dissemination.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was funded by the Croatian Science Foundation &#x201C;Professionalism in Health &#x2013; Decision making in practice and research&#x201D; (ProDeM) under grant IP-2019-04-4882. The funder had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p></sec><sec><title>Data Availability</title><p>The datasets used and analyzed during the study are publicly available on the Open Science Framework website [<xref ref-type="bibr" rid="ref46">46</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: A Mijatovi&#x0107;, IB, LU</p><p>Data curation: A Mijatovi&#x0107;</p><p>Formal analysis: A Mijatovi&#x0107;, B&#x0106;, IB, LU, NB, RB</p><p>Funding acquisition: A Maru&#x0161;i&#x0107;</p><p>Investigation: A Mijatovi&#x0107;</p><p>Methodology: A Mijatovi&#x0107;</p><p>Project administration: A Maru&#x0161;i&#x0107;</p><p>Resources: IB</p><p>Software: A Mijatovi&#x0107;</p><p>Supervision: A Maru&#x0161;i&#x0107;</p><p>Validation: A Mijatovi&#x0107;, B&#x0106;, NB, RB</p><p>Visualization: A Mijatovi&#x0107;</p><p>Writing&#x2014;original draft: A Mijatovi&#x0107;</p><p>Writing&#x2014;review and editing: A Maru&#x0161;i&#x0107;, B&#x0106;, IB, LU, NB, RB</p><p>All authors approved the
submitted version and take accountability for the work.</p></fn><fn fn-type="conflict"><p>A Maru&#x0161;i&#x0107; and NB are active Cochrane members. All other authors declare no other conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AUCROC</term><def><p>area under the curve of the receiver operating characteristic</p></def></def-item><def-item><term id="abb3">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb6">PLS</term><def><p>plain language summary</p></def></def-item><def-item><term id="abb7">SciBERT</term><def><p>Scientific Bidirectional Encoder Representations from Transformers</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Whiting</surname><given-names>P</given-names> </name><name name-style="western"><surname>Davenport</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Deeks</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Bossuyt</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Leeflang</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Takwoingi</surname><given-names>Y</given-names> </name></person-group><article-title>Writing a plain language summary</article-title><source>Cochrane Handbook for Systematic Reviews of Diagnostic Test Accuracy</source><year>2023</year><publisher-name>John Wiley &#x0026; 
Sons</publisher-name></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Higgins</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chandler</surname><given-names>J</given-names> </name><etal/></person-group><source>Cochrane Handbook for Systematic Reviews of Interventions</source><year>2023</year><access-date>2026-03-30</access-date><publisher-name>Cochrane Collaboration</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.cochrane.org/authors/handbooks-and-manuals/handbook">https://www.cochrane.org/authors/handbooks-and-manuals/handbook</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Weiss</surname><given-names>BD</given-names> </name></person-group><source>Health Literacy and Patient Safety: Help Patients Understand: Manual for Clinicians</source><year>2007</year><access-date>2026-03-30</access-date><edition>2</edition><publisher-name>American Medical Association</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://books.google.be/books?id=quJaYgEACAAJ">https://books.google.be/books?id=quJaYgEACAAJ</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sch&#x00FC;nemann</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Vist</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Santesso</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Deeks</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Higgins</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chandler</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cumpston</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Welch</surname><given-names>VA</given-names> </name></person-group><article-title>Chapter 15: Interpreting results and drawing conclusions</article-title><source>Cochrane Handbook for Systematic Reviews of Interventions Version 6.5</source><year>2024</year><publisher-name>Cochrane Collaboration</publisher-name></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Glenton</surname><given-names>C</given-names> </name><name name-style="western"><surname>Nilsen</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Sporst&#x00F8;l F&#x00F8;nhus</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goudie</surname><given-names>S</given-names> </name><name name-style="western"><surname>Noonan</surname><given-names>E</given-names> </name></person-group><article-title>How to write a plain language summary of a Cochrane intervention review (version 10)</article-title><source>Cochrane Norway</source><access-date>2026-03-22</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.cochrane.no/how-write-plain-language-summary">https://www.cochrane.no/how-write-plain-language-summary</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Lasserson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Santesso</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cumpston</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>R</given-names> </name><name name-style="western"><surname>&#x00D3;g&#x00E1;in</surname><given-names>ON</given-names> </name></person-group><article-title>Incorporating GRADE in Cochrane reviews: feedback from the CEU screening programme</article-title><year>2016</year><access-date>2026-03-22</access-date><publisher-name>Cochrane</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.cochrane.org/sites/default/files/uploads/PDFs/MECIR/Incorporating%20GRADE%20in%20Cochrane%20Reviews.pdf">https://www.cochrane.org/sites/default/files/uploads/PDFs/MECIR/Incorporating%20GRADE%20in%20Cochrane%20Reviews.pdf</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Santesso</surname><given-names>N</given-names> </name><name name-style="western"><surname>Glenton</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dahm</surname><given-names>P</given-names> </name><etal/></person-group><article-title>GRADE guidelines 26: informative statements to communicate the findings of systematic reviews of interventions</article-title><source>J Clin Epidemiol</source><year>2020</year><month>03</month><volume>119</volume><fpage>126</fpage><lpage>135</lpage><pub-id 
pub-id-type="doi">10.1016/j.jclinepi.2019.10.014</pub-id><pub-id pub-id-type="medline">31711912</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Changiz</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yousefy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fakhari</surname><given-names>M</given-names> </name></person-group><article-title>Research utilization process model: a cyclical, spiral, and developmental process to provide conclusive research knowledge in health professions education</article-title><source>Med J Islam Repub Iran</source><year>2020</year><volume>34</volume><fpage>79</fpage><pub-id pub-id-type="doi">10.34171/mjiri.34.79</pub-id><pub-id pub-id-type="medline">33306047</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kurtzman</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>J</given-names> </name></person-group><article-title>Effective presentation of health care performance information for consumer decision making: a systematic review</article-title><source>Patient Educ Couns</source><year>2016</year><month>01</month><volume>99</volume><issue>1</issue><fpage>36</fpage><lpage>43</lpage><pub-id pub-id-type="doi">10.1016/j.pec.2015.07.030</pub-id><pub-id pub-id-type="medline">26277826</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gray</surname><given-names>JA</given-names> </name></person-group><article-title>Discussion and conclusion</article-title><source>AME Med J</source><year>2019</year><volume>4</volume><fpage>26</fpage><pub-id 
pub-id-type="doi">10.21037/amj.2019.04.05</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bujnowska-Fedak</surname><given-names>MM</given-names> </name><name name-style="western"><surname>W&#x0119;gierek</surname><given-names>P</given-names> </name></person-group><article-title>The impact of online health information on patient health behaviours and making decisions concerning health</article-title><source>Int J Environ Res Public Health</source><year>2020</year><month>01</month><day>31</day><volume>17</volume><issue>3</issue><fpage>880</fpage><pub-id pub-id-type="doi">10.3390/ijerph17030880</pub-id><pub-id pub-id-type="medline">32023828</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Goonawardene</surname><given-names>N</given-names> </name></person-group><article-title>Internet health information seeking and the patient-physician relationship: a systematic review</article-title><source>J Med Internet Res</source><year>2017</year><month>01</month><day>19</day><volume>19</volume><issue>1</issue><fpage>e9</fpage><pub-id pub-id-type="doi">10.2196/jmir.5729</pub-id><pub-id pub-id-type="medline">28104579</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Murray</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pollack</surname><given-names>L</given-names> </name><etal/></person-group><article-title>The impact of health information on the internet on the physician-patient relationship: patient 
perceptions</article-title><source>Arch Intern Med</source><year>2003</year><month>07</month><day>28</day><volume>163</volume><issue>14</issue><fpage>1727</fpage><lpage>1734</lpage><pub-id pub-id-type="doi">10.1001/archinte.163.14.1727</pub-id><pub-id pub-id-type="medline">12885689</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tonsaker</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bartlett</surname><given-names>G</given-names> </name><name name-style="western"><surname>Trpkov</surname><given-names>C</given-names> </name></person-group><article-title>Health information on the internet: gold mine or minefield?</article-title><source>Can Fam Physician</source><year>2014</year><month>05</month><volume>60</volume><issue>5</issue><fpage>407</fpage><lpage>408</lpage><pub-id pub-id-type="medline">24828994</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>RX</given-names> </name></person-group><article-title>How online searches fuel health anxiety: investigating the link between health-related searches, health anxiety, and future intention</article-title><source>Comput Human Behav</source><year>2022</year><month>11</month><volume>136</volume><fpage>107384</fpage><pub-id pub-id-type="doi">10.1016/j.chb.2022.107384</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baumann</surname><given-names>E</given-names> </name><name name-style="western"><surname>Dierks</surname><given-names>ML</given-names> 
</name></person-group><article-title>Cyberchondria - a new behavioral syndrome? [Article in German]</article-title><source>Psychother Psychosom Med Psychol</source><year>2021</year><month>06</month><volume>71</volume><issue>6</issue><fpage>243</fpage><lpage>255</lpage><pub-id pub-id-type="doi">10.1055/a-1348-8059</pub-id><pub-id pub-id-type="medline">34102694</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doherty-Torstrick</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Walton</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Fallon</surname><given-names>BA</given-names> </name></person-group><article-title>Cyberchondria: parsing health anxiety from online behavior</article-title><source>Psychosomatics</source><year>2016</year><volume>57</volume><issue>4</issue><fpage>390</fpage><lpage>400</lpage><pub-id pub-id-type="doi">10.1016/j.psym.2016.02.002</pub-id><pub-id pub-id-type="medline">27044514</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x0160;uto</surname><given-names>J</given-names> </name><name name-style="western"><surname>Maru&#x0161;i&#x0107;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Buljan</surname><given-names>I</given-names> </name></person-group><article-title>Linguistic analysis of plain language summaries and corresponding scientific summaries of Cochrane systematic reviews about oncology interventions</article-title><source>Cancer Med</source><year>2023</year><month>05</month><volume>12</volume><issue>9</issue><fpage>10950</fpage><lpage>10960</lpage><pub-id pub-id-type="doi">10.1002/cam4.5825</pub-id><pub-id pub-id-type="medline">36951519</pub-id></nlm-citation></ref><ref 
id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mimouni</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mimouni</surname><given-names>F</given-names> </name><name name-style="western"><surname>Segev</surname><given-names>F</given-names> </name></person-group><article-title>Conclusiveness of the Cochrane Eye and Vision Group Reviews</article-title><source>BMC Res Notes</source><year>2015</year><month>06</month><day>16</day><volume>8</volume><fpage>242</fpage><pub-id pub-id-type="doi">10.1186/s13104-015-1221-x</pub-id><pub-id pub-id-type="medline">26076817</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chuai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name></person-group><article-title>Conclusiveness of the Cochrane reviews in gynaecological cancer: a systematic analysis</article-title><source>J Int Med Res</source><year>2015</year><month>06</month><volume>43</volume><issue>3</issue><fpage>311</fpage><lpage>315</lpage><pub-id pub-id-type="doi">10.1177/0300060515574922</pub-id><pub-id pub-id-type="medline">25870179</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mandel</surname><given-names>D</given-names> </name><name name-style="western"><surname>Littner</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mimouni</surname><given-names>FB</given-names> </name><name 
name-style="western"><surname>Lubetzky</surname><given-names>R</given-names> </name></person-group><article-title>Conclusiveness of the Cochrane Neonatal Reviews: a systematic analysis</article-title><source>Acta Paediatr</source><year>2006</year><month>10</month><volume>95</volume><issue>10</issue><fpage>1209</fpage><lpage>1212</lpage><pub-id pub-id-type="doi">10.1080/08035250600580537</pub-id><pub-id pub-id-type="medline">16982491</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lubetzky</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mimouni</surname><given-names>FB</given-names> </name><name name-style="western"><surname>Marom</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mandel</surname><given-names>D</given-names> </name></person-group><article-title>Conclusiveness of the Cochrane Reviews in pediatric-gastroenterology: a systematic analysis</article-title><source>Eur J Gastroenterol Hepatol</source><year>2013</year><month>02</month><volume>25</volume><issue>2</issue><fpage>252</fpage><lpage>254</lpage><pub-id pub-id-type="doi">10.1097/MEG.0b013e32835a1083</pub-id><pub-id pub-id-type="medline">23044810</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Conway</surname><given-names>A</given-names> </name><name name-style="western"><surname>Conway</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Soalheira</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sutherland</surname><given-names>J</given-names> </name></person-group><article-title>High quality of evidence is uncommon in Cochrane systematic 
reviews in Anaesthesia, Critical Care and Emergency Medicine</article-title><source>Eur J Anaesthesiol</source><year>2017</year><month>12</month><volume>34</volume><issue>12</issue><fpage>808</fpage><lpage>813</lpage><pub-id pub-id-type="doi">10.1097/EJA.0000000000000691</pub-id><pub-id pub-id-type="medline">29095726</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bani&#x0107;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fidahi&#x0107;</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0160;uto</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Conclusiveness, linguistic characteristics and readability of Cochrane plain language summaries of intervention reviews: a cross-sectional study</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>09</month><day>10</day><volume>22</volume><issue>1</issue><fpage>240</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01721-7</pub-id><pub-id pub-id-type="medline">36088293</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bird</surname><given-names>S</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>E</given-names> </name><name name-style="western"><surname>Loper</surname><given-names>E</given-names> </name></person-group><source>Natural Language Processing with Python</source><year>2009</year><publisher-name>O&#x2019;Reilly Media</publisher-name><pub-id pub-id-type="other">9780596516499</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khurana</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Koli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khatter</surname><given-names>K</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>Natural language processing: state of the art, current trends and challenges</article-title><source>Multimed Tools Appl</source><year>2023</year><volume>82</volume><issue>3</issue><fpage>3713</fpage><lpage>3744</lpage><pub-id pub-id-type="doi">10.1007/s11042-022-13428-4</pub-id><pub-id pub-id-type="medline">35855771</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>A complete guide to natural language processing</article-title><source>DeepLearning.AI</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.deeplearning.ai/resources/natural-language-processing">https://www.deeplearning.ai/resources/natural-language-processing</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Srinivasan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sitaram</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ganu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dandapat</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bali</surname><given-names>K</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>M</given-names> </name></person-group><article-title>Predicting the performance of multilingual NLP models</article-title><source>arXiv</source><comment>Preprint posted online on Oct 21, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2110.08875</pub-id></nlm-citation></ref><ref 
id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2021</year><month>10</month><day>15</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>SciBERT: a pretrained language model for scientific text</article-title><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>Nov 3-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/D19-1371</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Longformer: the long-document 
transformer</article-title><source>arXiv</source><comment>Preprint posted online on Apr 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.05150</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brali&#x0107;</surname><given-names>N</given-names> </name><name name-style="western"><surname>Buljan</surname><given-names>I</given-names> </name></person-group><article-title>The association between research design and the perceived treatment effectiveness: a cross-sectional study</article-title><source>Front Med (Lausanne)</source><year>2023</year><month>12</month><day>22</day><volume>10</volume><fpage>1220999</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1220999</pub-id><pub-id pub-id-type="medline">38196834</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on Oct 11, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Causevic</surname><given-names>S</given-names> </name></person-group><article-title>Evolution of large language models&#x2014;BERT, GPT3, MUM, and 
PaLM</article-title><source>Medium</source><year>2022</year><access-date>2026-03-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://medium.com/data-science/self-supervised-transformer-models-bert-gpt3-mum-and-paml-2b5e29ea0c26">https://medium.com/data-science/self-supervised-transformer-models-bert-gpt3-mum-and-paml-2b5e29ea0c26</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Fine-tuning</article-title><source>Hugging Face</source><access-date>2026-03-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/docs/transformers/training">https://huggingface.co/docs/transformers/training</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Winastwan</surname><given-names>R</given-names> </name></person-group><article-title>Text classification with BERT in PyTorch</article-title><source>Towards Data Science</source><year>2021</year><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f">https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>What is transfer learning?</article-title><source>GeeksforGeeks</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.geeksforgeeks.org/ml-introduction-to-transfer-learning">https://www.geeksforgeeks.org/ml-introduction-to-transfer-learning</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Oztel</surname><given-names>I</given-names> </name><name name-style="western"><surname>Yolcu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Oz</surname><given-names>C</given-names> </name></person-group><article-title>Performance comparison of transfer learning and training from scratch approaches for deep facial expression recognition</article-title><conf-name>2019 4th International Conference on Computer Science and Engineering (UBMK)</conf-name><conf-date>Sep 11-15, 2019</conf-date><pub-id pub-id-type="doi">10.1109/UBMK.2019.8907203</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Paszke</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>S</given-names> </name><name name-style="western"><surname>Massa</surname><given-names>F</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Wallach</surname><given-names>H</given-names> </name><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alch&#x00E9;-Buc</surname><given-names>FD</given-names> </name><name name-style="western"><surname>Fox</surname><given-names>E</given-names> </name><name name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>PyTorch: an imperative style, high-performance deep learning library</article-title><source>Advances in Neural Information Processing Systems 32</source><year>2019</year><publisher-name>Curran Associates, Inc</publisher-name><fpage>8024</fpage><lpage>8035</lpage></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kluyver</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ragan-Kelley</surname><given-names>B</given-names> </name><name name-style="western"><surname>P&#x00E9;rez</surname><given-names>F</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Loizides</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>B</given-names> </name></person-group><article-title>Jupyter notebooks&#x2014;a publishing format for reproducible computational workflows</article-title><source>Positioning and Power in Academic Publishing: Players, Agents and Agendas</source><year>2016</year><publisher-name>IOS Press</publisher-name><fpage>87</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.3233/978-1-61499-649-1-87</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="web"><article-title>Models</article-title><source>Hugging Face</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/models">https://huggingface.co/models</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="web"><article-title>AdamW</article-title><source>PyTorch</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#adamw">https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#adamw</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="web"><article-title>CrossEntropyLoss</article-title><source>PyTorch</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html">https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Reed</surname><given-names>R</given-names> </name><name name-style="western"><surname>Marks</surname><given-names>RJ</given-names> </name></person-group><source>Neural Smithing: Supervised Learning in Feedforward Artificial Neural Networks</source><year>1999</year><publisher-name>MIT Press</publisher-name><pub-id pub-id-type="other">9780262181907</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="web"><article-title>scikit-learn: machine learning in Python</article-title><source>scikit-learn</source><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://scikit-learn.org/stable">https://scikit-learn.org/stable</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><article-title>Readability, linguistic characteristics and conclusiveness of Cochrane plain language summaries of intervention reviews: a cross-sectional study</article-title><source>OSF</source><access-date>2026-03-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://osf.io/qvu3a/overview">https://osf.io/qvu3a/overview</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="web"><article-title>Code of ethics of the Croatian Science Foundation</article-title><source>Hrvatska Zaklada za Znanost</source><year>2020</year><access-date>2026-03-30</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://hrzz.hr/wp-content/uploads/2020/08/HRZZ-Code-of-Ethics.pdf">https://hrzz.hr/wp-content/uploads/2020/08/HRZZ-Code-of-Ethics.pdf</ext-link></comment></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>McDonald</surname><given-names>S</given-names> </name><name name-style="western"><surname>Noel-Storr</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Machine learning reduced workload with minimal risk of missing studies: development and evaluation of a randomized controlled trial classifier for Cochrane Reviews</article-title><source>J Clin Epidemiol</source><year>2021</year><month>05</month><volume>133</volume><fpage>140</fpage><lpage>151</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.11.003</pub-id><pub-id pub-id-type="medline">33171275</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Povey</surname><given-names>N</given-names> </name></person-group><article-title>ChatGPT: abstractive text summarization</article-title><source>Medium</source><year>2022</year><access-date>2024-09-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://medium.com/@nadirapovey/chatgpt-text-summarization-44f768222a4c">https://medium.com/@nadirapovey/chatgpt-text-summarization-44f768222a4c</ext-link></comment></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dave</surname><given-names>T</given-names> </name><name name-style="western"><surname>Athaluri</surname><given-names>SA</given-names> </name><name 
name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title><source>Front Artif Intell</source><year>2023</year><volume>6</volume><fpage>1169595</fpage><pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id><pub-id pub-id-type="medline">37215063</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Idnay</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Evaluating large language models on medical evidence summarization</article-title><source>NPJ Digit Med</source><year>2023</year><month>08</month><day>24</day><volume>6</volume><issue>1</issue><fpage>158</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00896-7</pub-id><pub-id pub-id-type="medline">37620423</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><name name-style="western"><surname>Subbiah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online 
on May 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davidson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chae</surname><given-names>Y</given-names> </name></person-group><article-title>Large language models for text classification: from zero-shot learning to instruction-tuning</article-title><source>Sociol Methods Res</source><year>2025</year><pub-id pub-id-type="doi">10.1177/00491241251325243</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary material presenting detailed model training procedures, experimental setups, and full performance metrics for SciBERT and Longformer models.</p><media xlink:href="medinform_v14i1e72657_app1.docx" xlink:title="DOCX File, 764 KB"/></supplementary-material></app-group></back></article>