<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e63267</article-id><article-id pub-id-type="doi">10.2196/63267</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Transformer-Based Language Models for Group Randomized Trial Classification in Biomedical Literature: Model Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Aghaarabi</surname><given-names>Elaheh</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Murray</surname><given-names>David</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Office of Disease Prevention, National Institutes of Health</institution><addr-line>6705 Rockledge Dr</addr-line><addr-line>Bethesda</addr-line><addr-line>MD</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Yu</surname><given-names>Huizi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lan</surname><given-names>Mengfei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Elaheh Aghaarabi, MSc, Office of Disease Prevention, National Institutes of Health, 6705 Rockledge Dr, Bethesda, MD, 20892, United States, 1 3014964000; <email>elaheh.a.arabi@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>9</day><month>5</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e63267</elocation-id><history><date date-type="received"><day>14</day><month>06</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>02</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>02</month><year>2025</year></date></history><copyright-statement>&#x00A9; Elaheh Aghaarabi, David Murray. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 9.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e63267"/><abstract><sec><title>Background</title><p>For the public health community, monitoring recently published articles is crucial for staying informed about the latest research developments. However, identifying publications about studies with specific research designs from the extensive body of public health publications is a challenge with the currently available methods.</p></sec><sec><title>Objective</title><p>Our objective is to develop a fine-tuned pretrained language model that can accurately identify publications from clinical trials that use a group- or cluster-randomized trial (GRT), individually randomized group-treatment trial (IRGT), or stepped wedge group- or cluster-randomized trial (SWGRT) design within the biomedical literature.</p></sec><sec sec-type="methods"><title>Methods</title><p>We fine-tuned the BioMedBERT language model using a dataset of biomedical literature from the Office of Disease Prevention at the National Institute of Health. The model was trained to classify publications into three categories of clinical trials that use nested designs. 
The model performance was evaluated on unseen data and demonstrated high sensitivity and specificity for each class.</p></sec><sec sec-type="results"><title>Results</title><p>When our proposed model was tested for generalizability with unseen data, it delivered high sensitivity and specificity for each class as follows: negatives (0.95 and 0.93), GRTs (0.94 and 0.90), IRGTs (0.81 and 0.97), and SWGRTs (0.96 and 0.99), respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our work demonstrates the potential of fine-tuned, domain-specific language models to accurately identify publications reporting on complex and specialized study designs, addressing a critical need in the public health research community. This model offers a valuable tool for the public health community to directly identify publications from clinical trials that use one of the three classes of nested designs.</p></sec></abstract><kwd-group><kwd>document classification</kwd><kwd>machine learning</kwd><kwd>natural language processing</kwd><kwd>randomized trials</kwd><kwd>transformer</kwd><kwd>AI</kwd><kwd>artificial intelligence</kwd><kwd>clinical trials</kwd><kwd>language model</kwd><kwd>development</kwd><kwd>dataset</kwd><kwd>biomedical</kwd><kwd>model</kwd><kwd>tool</kwd><kwd>trial</kwd><kwd>public health</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Researchers need to identify publications from trials that use nested designs to access evidence relevant to community-level interventions and make informed decisions about public health strategies, as well as to understand the effectiveness of these interventions in improving health outcomes and reducing health disparities within populations. 
Additionally, they may require these publications to conduct meta-analyses and systematic reviews.</p><p>There are three important classes of nested designs widely used in clinical trials [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. The parallel group- or cluster-randomized trial (GRT) involves the randomization of groups or clusters to study arms with observations taken using members of those groups or clusters [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. This design is widely used to evaluate interventions that are delivered to groups or clusters that modify the physical or social environment, or that cannot be delivered to individuals without the substantial risk of contamination. The stepped wedge group or cluster-randomized trial (SWGRT) involves the randomization of groups or clusters to sequences; all groups or clusters begin in the control arm and transition to the intervention arm on a schedule determined by their sequence so that by the end of the trial, all groups or clusters are in the intervention arm [<xref ref-type="bibr" rid="ref9">9</xref>]. The individually randomized group-treatment (IRGT) trial involves the random assignment of individuals to study arms but delivery of the intervention in a group-based format or using shared intervention agents [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. All three nested designs have design, analytic, and sample size challenges not found in the traditional randomized clinical trial [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Currently, most public health researchers, including ourselves, use manual searches to identify GRTs, IRGTs, or SWGRTs, because, so far as we are aware, there are no automated methods for identifying published papers using these designs. 
However, manual searches can easily miss many qualifying publications due to the complexity of search parameters and the lack of consistent reporting. Document classification using machine learning and natural language processing techniques offers a more promising approach for categorizing documents into predefined groups.</p><p>Garcia et al [<xref ref-type="bibr" rid="ref12">12</xref>] enhanced automatic document classification in the biomedical domain by leveraging Wikipedia knowledge to create bag-of-concepts representations, resulting in performance gains over traditional bag-of-words approaches in both single-label and multi-label classification tasks. Cohen [<xref ref-type="bibr" rid="ref13">13</xref>] also proposed a biomedical text classifier, which integrates document words, MeSH terms, and normalized biological entity identifiers.</p><p>Previous studies have demonstrated the utility of machine learning approaches in identifying randomized controlled trials (RCTs) from biomedical literature databases. Marshal et al [<xref ref-type="bibr" rid="ref14">14</xref>] used machine learning models, including convolutional neural networks, support vector machines, and ensemble models to identify RCT publications. Al-Jaishi et al [<xref ref-type="bibr" rid="ref15">15</xref>] addressed the challenge of accurately identifying GRTs reports from bibliographic citations by leveraging static embedding techniques, developing and validating machine learning algorithms for information retrieval.</p><p>While these studies have focused on identifying specific types of RCTs, such as conventional RCTs and GRTs, our research aims to extend this approach to the identification of diverse categories of randomized trials, including GRTs, IRGTs, and SWGRTs. 
To achieve this, we propose a novel approach leveraging fine-tuned language models, specifically the pretrained BioMedBERT model, trained on a dataset of biomedical literature curated by the Office of Disease Prevention at the National Institute of Health.</p><p>Large language models represent an ideal choice for the development of biomedical text classifiers due to their capacity to grasp the contextual nuances within the data. Pretrained transformer language models, like bidirectional encoder representations from transformers (BERT) [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], have outperformed the existing deep neural network models, including convolutional neural networks and recurrent neural networks. Examples of transformer-based models trained on biomedical data include BioBERT [<xref ref-type="bibr" rid="ref19">19</xref>], BioLinkBERT [<xref ref-type="bibr" rid="ref20">20</xref>], BlueBERT [<xref ref-type="bibr" rid="ref21">21</xref>], and BioMedBERT [<xref ref-type="bibr" rid="ref22">22</xref>], which are pretrained on biomedical literature and clinical text.</p><p>While these pretrained models are readily applicable to common tasks due to their training on biomedical data, classifying scientific literature presents a unique challenge because scientific literature encompasses a diverse range of topics, writing styles, and research fields. Identifying and categorizing clinical trials into highly specialized categories is especially challenging, even for human coders. 
Leveraging transfer learning and fine-tuning a pretrained language model allows the machine learning platform to learn and adapt to the particular context and vocabulary of these types of documents, enhancing its effectiveness in tasks such as document classification, information extraction, and summarization [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>To our knowledge, there is currently no transformer-based language model fine-tuned to identify clinical trial publications based on nested designs. In our method, we leverage BioMedBERT [<xref ref-type="bibr" rid="ref22">22</xref>], a model that has been pretrained from scratch using abstracts from PubMed and full-text articles from PubMedCentral. We fine-tuned BioMedBERT using labeled clinical trials, with a specific focus on distinguishing various types of clinical trial publications, including GRTs, IRGTs, and SWGRTs. The fine-tuning process involved training on a carefully curated dataset comprising a substantial number of GRTs, IRGTs, and SWGRTs. The outcome of this fine-tuning process is a model that provides a high level of sensitivity and specificity in classifying and differentiating various types of randomized trial publications.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Traditional Machine Learning Models (Baseline Model)</title><p>In our study, we initially established a baseline model for classifying publications using traditional machine learning and word embedding techniques to demonstrate the effectiveness of employing a transformer-based model in identifying publications based on nested designs. To create the baseline model, we employed FastText (Facebook AI Research) to generate word embeddings, followed by a logistic regression model. 
Logistic regression has been widely recognized in the literature as an effective classifier for text data due to its simplicity, interpretability, and robust performance across various domains [<xref ref-type="bibr" rid="ref23">23</xref>]. To enhance the model&#x2019;s capability to process biomedical text data, we leveraged pretrained FastText embeddings specifically trained on PubMed and MIMIC-III data, known as BioWordVec [<xref ref-type="bibr" rid="ref24">24</xref>], to extract meaningful features from titles and abstracts of publications. Subsequently, we used these extracted embeddings to train a logistic regression model for the classification of publications.</p></sec><sec id="s2-2"><title>Evaluation Metrics</title><p>There are various metrics available to evaluate the performance of a classifier. While the area under the curve is commonly used in binary classifiers, accuracy can be helpful for balanced evaluation datasets. However, in our case, where the model serves as a multiclass classifier with imbalanced data, the F<sub>1</sub>-score emerges as the most reliable metric [<xref ref-type="bibr" rid="ref25">25</xref>]. Our primary objective was to fine-tune an existing pretrained language model to maximize the F<sub>1</sub>-score on a validation dataset.</p><p>The F<sub>1</sub>-score is the harmonic mean of precision and recall. Precision is the ratio of correctly predicted positive observations to the total predicted positives. It measures the accuracy of the positive predictions made by the model. High precision ensures that when our model predicts an article belongs to a nested design group, it is highly likely to be accurate. This is crucial in applications where precision contributes to the trustworthiness of the classification outcomes, such as systemically classifying research publications. Recall, which is the same as sensitivity, is vital when the cost of false negatives is high. 
A high recall ensures that our model effectively captures a comprehensive set of articles within each predefined group.</p><p>The F<sub>1</sub>-score is particularly useful in situations where there is an imbalance between the classes or when there is an equal importance placed on precision and recall. It is a metric that balances the trade-off between precision and recall, providing a single value that reflects the overall performance of the model.</p><p>Our dataset is imbalanced with fewer examples in the IRGT and SWGRT classes, and accuracy alone can be misleading. Precision, recall, and F1-score provide a more nuanced view of a model&#x2019;s effectiveness, especially in identifying the strengths and weaknesses associated with false positives and false negatives.</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>True Positives</mml:mtext><mml:mrow><mml:mtext>True Positives</mml:mtext><mml:mo>+</mml:mo><mml:mtext>False Positives</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mtext>Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>True Positives</mml:mtext><mml:mrow><mml:mtext>True Positives</mml:mtext><mml:mo>+</mml:mo><mml:mtext>False Negatives</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mtext>F1 
Score</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>&#x00D7;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>+</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Specificity serves as a crucial metric for assessing the number of false positives within a model. It measures the ability of the model to correctly identify negative instances, thus providing insight into the model&#x2019;s performance in avoiding false-positive predictions. Although it was not initially used as an evaluation metric during our experimental phase and model development, it was calculated to provide a comprehensive assessment of performance, particularly in gauging the rate of false positives.</p><disp-formula id="E9"><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mtext>Specificity</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>True Negatives</mml:mtext><mml:mrow><mml:mtext>True Negatives</mml:mtext><mml:mo>+</mml:mo><mml:mtext>False Positives</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The weighted average considers the class imbalance in the dataset by considering the contribution of each class proportional to the number of instances in that class. 
In other words, classes with more instances have a greater impact on the overall metric than classes with fewer instances.</p><disp-formula id="equWL5"><mml:math id="eqn5"><mml:mi>W</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>g</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2219;</mml:mo><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:math></disp-formula><p>Where</p><p><inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the metric value (eg, precision, recall, F1-score) for class,  <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the number of instances in class i,  <inline-formula><mml:math id="ieqn3"><mml:mi>C</mml:mi></mml:math></inline-formula> is the number of classes, and <italic>N</italic> is the total number of instances in the dataset</p></sec><sec id="s2-3"><title>BioMedBERT</title><p>We chose BioMedBERT as the initial pretrained model due to its superior performance compared to other existing models on our gold standard data, publications published prior to 2021 that were identified as GRT, IRGT, and SWGRT papers using search queries. 
BioMedBERT is a pretrained language model developed by Microsoft Research for biomedical text processing using abstracts from PubMed and full-text articles from PubMedCentral [<xref ref-type="bibr" rid="ref22">22</xref>]. It is a specialized variant of the BERT architecture [<xref ref-type="bibr" rid="ref16">16</xref>], designed to capture domain-specific nuances in biomedical literature. The architecture of BioMedBERT enables it to learn contextualized representations of words and phrases bidirectionally and understand the contextual relationships within biomedical texts. The BERT model used in our study was downloaded from Hugging Face&#x2019;s Transformers library, configured as a classifier. In this framework, the tokenizer automatically manages special tokens such as [CLS] and [SEP], ensuring proper preprocessing for input sequences.</p><p>For the classification task, the [CLS] token's embedding from the final layer of BERT serves as a representation of the input sequence's contextual information. A multilayer perceptron is applied to this embedding to perform the classification. This multilayer perceptron consists of fully connected layers that map the [CLS] token's representation to the output space, followed by the softmax activation for probability distribution over classes.</p><p>The pretrained BioMedBERT uses a bidirectional transformer architecture with several layers of self-attention mechanisms. The model embeddings, including word embeddings and positional embeddings, contribute to encoding patterns in biomedical language.</p><p>During the pretraining phase, BioMedBERT was initialized with weights obtained from training on a domain-specific corpus: 14 million abstracts, 3.2 billion words, and 21 gigabytes [<xref ref-type="bibr" rid="ref22">22</xref>]. This large-scale training ensures that the model captures a wide range of biomedical concepts, terminology, and contextual relationships. 
BERT-BASE [<xref ref-type="bibr" rid="ref16">16</xref>] with 12 transformer layers and 100 million parameters was used to pretrain BioMedBERT.</p><p>A pooling layer was introduced atop the transformer&#x2019;s final layer, known as the embedding layer. This embedding layer underwent pooling to derive a fixed-size representation of the entire input sequence. To capture nonlinearity and intricate patterns within the data, a feedforward layer was incorporated. This layer is linked to the output layer, responsible for computing logits.</p></sec><sec id="s2-4"><title>Fine-Tuning BioMedBERT</title><p>For this study, we fine-tuned BioMedBERT to adapt the model to the specific nuances of our dataset related to publications from clinical trials that use nested designs. By leveraging the contextual information encoded in BioMedBERT, we aimed to enhance the accuracy and efficiency of our machine learning model in identifying and distinguishing various types of clinical trial publications. In particular, our goal was for BioMedBERT to serve as a multiclass classifier that can categorize biomedical publications into four distinct categories: GRT, IRGT, SWGRTs, and the broader category of publications based on studies that used other designs, which we refer to as negatives (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-5"><title>Data</title><p>The National Institute of Health Office of Disease Prevention provided a labeled dataset consisting of publications from PubMed, published prior to 2021, with each publication categorized into one of the three classes: GRT, IRGT, and SWGRT. We selected nonclinical trial publications from a list of 120 journals that published most of the nested clinical trial publications; we will refer to those publications as negative publications. The original dataset consisted of 891 GRT publications, 59 IRGT publications, 109 SWGRT publications, and 996 negative publications. 
The first version of the fine-tuned language model underwent training on titles and abstracts from this dataset. The best-performing model was subsequently employed to classify unlabeled publications published in 2021. After thorough verification of predictions by domain experts, a complementary set of 299 GRT, 40 IRGT, 65 SWGRT, and 1200 negative examples from 2021 was added to the original training, validation, and test sets.</p><p>The new dataset served as the foundation for tuning the model hyperparameters to predict labels for publications published in 2022. With the improved training data, the model demonstrated higher accuracy and F1-score, enabling precise classification of publications from the subsequent year, 2022. The same strategy as above was employed to add 2022 verified data to our training dataset, which resulted in the addition of 461 GRT, 195 IRGT, and 93 SWGRT, and 539 negative publications to the training, validation, and test datasets to prepare the classifier to predict labels for publications published in 2023. The Results section describes the final performance of the model trained and evaluated on this dataset.</p></sec><sec id="s2-6"><title>Tokenization</title><p>To process the textual data, we utilized the Hugging Face Trainer API in conjunction with the BioMedBERT tokenizer. The BioMedBERT tokenizer is trained on a corpus of biomedical text to tokenize and segment text into subwords using the WordPiece algorithm, just like the original BERT tokenizer. Therefore, the tokenizer is tailored to handle biomedical terminology and language patterns [<xref ref-type="bibr" rid="ref22">22</xref>]. 
For instance, when tokenizing the title &#x201C;Comparison of different intervention methods to reduce the incidence of venous thromboembolism: study protocol for a cluster-randomized, crossover trial&#x201D; from a 2023 publication, the tokens are segmented as follows: [&#x201C;comparison,&#x201D; &#x201C;of,&#x201D; &#x201C;different,&#x201D; &#x201C;intervention,&#x201D; &#x201C;methods,&#x201D; &#x201C;to,&#x201D; &#x201C;reduce,&#x201D; &#x201C;the,&#x201D; &#x201C;incidence,&#x201D; &#x201C;of,&#x2019; &#x201C;venous,&#x201D; &#x201C;thromboembolism,&#x201D; &#x201C;:,&#x201D; &#x201C;study,&#x201D; &#x201C;protocol,&#x201D; &#x201C;for,&#x201D; &#x201C;a,&#x201D; &#x201C;cluster,&#x201D; &#x201C;-,&#x201D; &#x201C;randomized,&#x201D; &#x201C;,&#x201D;, &#x201C;crossover,&#x201D; &#x201C;trial,&#x201D; &#x201C;.&#x201D;]. The tokenization process was carried out separately for the titles and abstracts of the publications. In order to maintain computational efficiency and manage memory constraints, we imposed a length limit on the tokenized text. Titles were truncated to a maximum length of 30 tokens, which was the maximum title length in the data, while abstracts were truncated to 256 tokens from the start of the text, ensuring that majority of abstracts in our dataset fit within this allocation. Shorter sequences were padded with zeros. Based on the length distribution of titles and abstracts, we adjusted the allocation to align with actual usage patterns, ensuring that all instances remain within the supported range while optimizing model efficiency.</p></sec><sec id="s2-7"><title>Addressing Class Imbalance</title><p>An imbalanced dataset refers to any dataset where there is an unequal distribution among classes, with one or more classes having significantly fewer instances than others. 
When trained on imbalanced datasets, models may exhibit a bias towards predicting the majority class more frequently, resulting in poor generalization for the minority class. Therefore, selecting the appropriate evaluation metric becomes crucial in such scenarios. A model might achieve high accuracy by predominantly predicting the majority class while displaying poor performance on the minority class. Various strategies exist to address class imbalance, including selecting appropriate performance metrics, such as precision-recall or the F<sub>1</sub>-score, to accurately reflect model performance. Techniques like undersampling the majority class, oversampling the minority class, employing synthetic data generation methods like SMOTE (synthetic minority over-sampling technique), leveraging algorithms designed to handle class imbalance robustly, and incorporating cost-sensitive learning by assigning costs to the loss function are among the approaches commonly employed [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. Given the observed class imbalance within the dataset, we implemented a customized loss function to mitigate the impact of this imbalance during the training phase. To achieve this, inverse class weights were calculated based on the number of examples in each class within the training dataset. 
These weights were then utilized in the weighted categorical cross-entropy loss function, assigning varying levels of importance to each class during model training.</p><p>The weighted categorical cross-entropy loss function is defined as follows:</p><disp-formula id="equWL6"><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Weighted Categorical Cross-Entropy Loss</mml:mtext><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><list list-type="simple"><list-item><p>where C is the number of classes, N is the number of samples in the dataset, <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the true probability distribution (one-hot encoded vector) for class <inline-formula><mml:math id="ieqn5"><mml:mi>i</mml:mi></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn6"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the predicted 
probability distribution for class <inline-formula><mml:math id="ieqn7"><mml:mi>i</mml:mi></mml:math></inline-formula>, and  <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the weight assigned to class <inline-formula><mml:math id="ieqn9"><mml:mi>i</mml:mi></mml:math></inline-formula></p></list-item></list><p>The inverse class frequency method is used to assign higher weights to classes with fewer examples. The rationale is to give higher weights to classes that are under-represented, making the model more sensitive to minority classes and potentially improving performance on imbalanced datasets.</p><disp-formula id="equWL7"><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>Total Number of Examples</mml:mtext><mml:mrow><mml:mtext>Number of Examples in class&#x00A0;</mml:mtext><mml:mi>i</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-8"><title>Hyperparameter Tuning</title><p>To evaluate the model&#x2019;s performance, we split our dataset into three subsets: a training set, a validation set, and a test set. The test set comprised 20% of the total dataset and was created by stratified random sampling from the labeled data. To assess the generalizability of our model to unseen data and to ensure a robust and unbiased evaluation of our model, a stratified k-fold cross-validation technique was used by splitting the remaining 80% of the dataset into subsets to train and validate the model iteratively, k times. In this technique, the dataset is divided into k folds while maintaining the same class distribution in each fold as the original dataset. We chose k=5 for a 5-fold cross-validation. 
Each fold maintained the proportion of class labels similar to that in the overall dataset. This helped prevent the model from being biased toward the majority class [<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>We conducted a series of experiments to determine the optimal hyperparameters for our model. These included exploring various values for learning rates, weight decay, batch sizes, and the number of training epochs [<xref ref-type="bibr" rid="ref22">22</xref>]. Our goal was to identify the combination of hyperparameters that produced the best model performance with the highest F1-score. The initial hyperparameters were selected based on the pretrained BioMedBERT model, and they were changed during the training process iteratively. The final model hyperparameters include a batch size of 32 and a learning rate of 0.00003098.</p><p>A regularization term is added to the loss function during training to penalize large weights and prevent overfitting [<xref ref-type="bibr" rid="ref29">29</xref>]. 
The modified loss function with weight decay is calculated as follows:</p><disp-formula id="equWL8"><mml:math id="eqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Total Loss</mml:mtext><mml:mo>=</mml:mo><mml:mtext>Original Loss</mml:mtext><mml:mo>+</mml:mo><mml:mfrac><mml:mi>&#x03BB;</mml:mi><mml:mn>2</mml:mn></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:msup><mml:mrow><mml:mo symmetric="true">&#x2016;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo symmetric="true">&#x2016;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Where original loss is the loss without regularization,  <inline-formula><mml:math id="ieqn10"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> is the weight decay hyperparameter, and  <inline-formula><mml:math id="ieqn11"><mml:msub><mml:mrow><mml:mi>&#x03A3;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> represents the sum of squared weights across all layers of the model.</p><p>The value of the weight decay hyperparameter is a crucial aspect of training. It determines the strength of the regularization effect. Too small a value may not prevent overfitting, while too large a value may penalize weights too much and stop the learning process. 
Moreover, early stopping was applied to the classifier head to prevent overfitting and improve efficiency with a patience of 5 epochs.</p><p>By following these steps, we developed a fine-tuned BioMedBERT model capable of classifying publications into the specified categories. This model was rigorously trained, validated, and optimized to maximize the classification accuracy and F<sub>1</sub>-score.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>To assess each model for generalizability, we evaluated its performance on the test set containing 20% of the final dataset that was randomly stratified and had not been introduced to the model previously.</p><sec id="s3-1"><title>Baseline Model Performance</title><p>After generating features using a FastText model and training the logistic regression model, we evaluated its performance across all classes. While the model demonstrated good performance on the majority class, indicating its efficacy in capturing prevalent patterns within the dataset, its performance on the minority class, IRGT, and SWGRT is suboptimal. <xref ref-type="table" rid="table1">Table 1</xref> presents the baseline model&#x2019;s performance across all classes, while <xref ref-type="table" rid="table2">Table 2</xref> depicts the confusion matrix of the baseline model. 
The weighted average for precision, recall or sensitivity, specificity, and F<sub>1</sub>-score is 0.85.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance metrics of logistic regression model across all classes.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall or sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">F<sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">Negative</td><td align="left" valign="top" rowspan="4">0.85</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top">GRT<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.82</td></tr><tr><td align="left" valign="top">IRGT<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.53</td></tr><tr><td align="left" valign="top">SWGRT<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">1</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.82</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>GRT: group- or cluster-randomized trial.</p></fn><fn id="table1fn2"><p><sup>b</sup>IRGT: individually randomized group-treatment.</p></fn><fn id="table1fn3"><p><sup>c</sup>SWGRT: stepped wedge group or cluster-randomized 
trial.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Confusion matrix.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Actual</td><td align="left" valign="bottom">Predicted</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr><tr><td align="left" valign="bottom">Classes</td><td align="left" valign="bottom">Negative</td><td align="left" valign="bottom">GRT</td><td align="left" valign="bottom">IRGT</td><td align="left" valign="bottom">SWGRT</td></tr></thead><tbody><tr><td align="left" valign="top">Negative</td><td align="left" valign="top">508</td><td align="left" valign="top">27</td><td align="left" valign="top">12</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">GRT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">60</td><td align="left" valign="top">269</td><td align="left" valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">IRGT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">17</td><td align="left" valign="top">15</td><td align="left" valign="top">26</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">SWGRT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">2</td><td align="left" valign="top">14</td><td align="left" valign="top">0</td><td align="left" valign="top">37</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>GRT: group- or cluster-randomized trial.</p></fn><fn id="table2fn2"><p><sup>b</sup>IRGT: individually randomized group-treatment.</p></fn><fn id="table2fn3"><p><sup>c</sup>SWGRT: stepped wedge group or cluster-randomized trial.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Pretrained Versus 
Fine-Tuned Performance</title><p>The low performance of the pretrained BioMedBERT classifier on all classes justifies the fine-tuning of the transformer-based model. Following the fine-tuning process using a training set that was curated by domain experts iteratively, the model&#x2019;s performance exhibited noticeable enhancement in all performance metrics. The inclusion of domain expertise in the data curation process contributed to refining the model&#x2019;s understanding and, consequently, improving its predictive capabilities.</p><p><xref ref-type="table" rid="table3">Table 3</xref> shows the performance of the latest version of the fine-tuned model on our test set, which was not seen by the model during training and validation. The confusion matrix, depicted in <xref ref-type="table" rid="table4">Table 4</xref>, provides a breakdown of the model&#x2019;s predictions against the actual values. The weighted average for precision, recall or sensitivity, specificity, and F<sub>1</sub>-score is 0.94.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance metrics of fine-tuned BioMedBERT across all classes.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall or sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">F<sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">Negative</td><td align="left" valign="top" rowspan="4">0.94</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">GRT<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.94</td><td 
align="left" valign="top">0.90</td><td align="left" valign="top">0.94</td></tr><tr><td align="left" valign="top">IRGT<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top">SWGRT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.96</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>GRT: group- or cluster-randomized trial.</p></fn><fn id="table3fn2"><p><sup>b</sup>IRGT: individually randomized group-treatment.</p></fn><fn id="table3fn3"><p><sup>c</sup>SWGRT: stepped wedge group or cluster-randomized trial.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Confusion matrix.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Actual</td><td align="left" valign="bottom">Predicted</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr><tr><td align="left" valign="bottom">Classes</td><td align="left" valign="bottom">Negative</td><td align="left" valign="bottom">GRT</td><td align="left" valign="bottom">IRGT</td><td align="left" valign="bottom">SWGRT</td></tr></thead><tbody><tr><td align="left" valign="top">Negative</td><td align="left" valign="top">518</td><td align="left" valign="top">12</td><td align="left" valign="top">17</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">GRT<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">14</td><td align="left" valign="top">311</td><td align="left" valign="top">4</td><td align="left" 
valign="top">2</td></tr><tr><td align="left" valign="bottom">IRGT<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="bottom">8</td><td align="char" char="." valign="bottom">3</td><td align="char" char="." valign="bottom">47</td><td align="char" char="." valign="bottom">0</td></tr><tr><td align="left" valign="top">SWGRT<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">51</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>GRT: group- or cluster-randomized trial.</p></fn><fn id="table4fn2"><p><sup>b</sup>IRGT: individually randomized group-treatment.</p></fn><fn id="table4fn3"><p><sup>c</sup>SWGRT: stepped wedge group or cluster-randomized trial.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>The model developed in this research used data provided by the Office of Disease Prevention at the National Institutes of Health and leveraged a transformer-based pretrained language model, BioMedBERT, to identify publications of clinical trials that used one of three nested designs. The model outperformed the baseline model developed using features generated by BioWordVec and logistic regression. Compared to our baseline model, the recall and specificity for each class demonstrated improvements as follows: 2 points for recall and 3 points for specificity in non-randomized class; 13 points for recall and 0 points for specificity in GRT; 36 points for recall and 1 point for specificity in IRGT; 26 points for recall and 1 point for specificity in SWGRT. 
The IRGT class exhibited the lowest sensitivity compared to other classes, due to fewer examples in the training data and the inherent difficulty in identifying such publications by only processing titles and abstracts. Even for human curators, labeling these publications is challenging, often necessitating meticulous examination of the whole paper&#x2019;s content. Since the fine-tuned model was trained solely on titles and abstracts, it had limited information available for predicting IRGT publications.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>While machine learning and natural language processing techniques have been utilized to identify RCTs in the medical literature [<xref ref-type="bibr" rid="ref15">15</xref>], there have been fewer models specifically designed to identify special categories of group randomized clinical trials. Existing models for biomedical document classification predominantly rely on static embedding techniques, such as Word2Vec or FastText, although recent approaches have increasingly adopted nonstatic embedding methods, particularly transformer-based models like BERT and BioBERT, for more dynamic and context-aware text representations [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]. Our model leverages attention mechanisms and dynamic embedding techniques to capture the varying importance of words within the context of each document. By dynamically adjusting the embedding representations based on the context of the input sequence, our model can better capture the nuances and semantic relationships within the text, leading to improved classification performance.</p></sec><sec id="s4-3"><title>Future Work</title><p>Moving forward, future research endeavors may focus on refining the model to distinguish between subcategories within each main category, such as the method, protocol, and results [<xref ref-type="bibr" rid="ref33">33</xref>]. 
Tailoring the model to address these distinctions could further enhance its utility in biomedical text classification tasks, facilitating more precise and comprehensive literature analysis.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Our study presents a robust framework leveraging transformer-based language models to effectively identify distinct categories of clinical trial publications within the biomedical literature. Through fine-tuning the pretrained BioMedBERT model, we achieved high accuracy and F<sub>1</sub>-score metrics across three categories: GRTs, IRGTs, and SWGRTs. The developed framework outperforms conventional search queries, providing advanced language understanding capabilities for discerning a broader spectrum of publications.</p><p>Our findings underscore the significance of transformer-based models in biomedical text classification, offering improved performance compared to traditional machine learning approaches and static embedding techniques. By continually updating and refining our model with new training data, we anticipate ongoing improvements in performance and adaptability over time. 
This iterative approach ensures the model remains up to date on the latest developments in the biomedical field, contributing to more efficient literature exploration, information retrieval, and knowledge discovery.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">GRT</term><def><p>group- or cluster-randomized trial</p></def></def-item><def-item><term id="abb2">IRGT</term><def><p>individually randomized group-treatment</p></def></def-item><def-item><term id="abb3">SMOTE</term><def><p>synthetic minority over-sampling technique</p></def></def-item><def-item><term id="abb4">SWGRT</term><def><p>stepped wedge group or cluster-randomized trial</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Murray</surname><given-names>DM</given-names> </name></person-group><article-title>Influential methods reports for group-randomized trials and related designs</article-title><source>Clin Trials</source><year>2022</year><month>08</month><volume>19</volume><issue>4</issue><fpage>353</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1177/17407745211063423</pub-id><pub-id pub-id-type="medline">34991379</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Murray</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Taljaard</surname><given-names>M</given-names> </name><name name-style="western"><surname>Turner</surname><given-names>EL</given-names> </name><name name-style="western"><surname>George</surname><given-names>SM</given-names> </name></person-group><article-title>Essential ingredients and innovations in the design and analysis of 
group-randomized trials</article-title><source>Annu Rev Public Health</source><year>2020</year><month>04</month><day>2</day><volume>41</volume><issue>1-19</issue><fpage>1</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1146/annurev-publhealth-040119-094027</pub-id><pub-id pub-id-type="medline">31869281</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Campbell</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>SJ</given-names> </name></person-group><source>How to Design, Analyse and Report Cluster Randomised Trials in Medicine and Health Related Research</source><year>2014</year><publisher-name>John Wiley &#x0026; Sons Ltd</publisher-name><pub-id pub-id-type="doi">10.1002/9781118763452</pub-id><pub-id pub-id-type="other">9781119992028</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Donner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Klar</surname><given-names>N</given-names> </name></person-group><source>Design and Analysis of Cluster Randomization Trials in Health Research</source><year>2000</year><publisher-name>Arnold</publisher-name><fpage>178</fpage><pub-id pub-id-type="other">0-34069153-0</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Eldridge</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kerry</surname><given-names>S</given-names> </name></person-group><article-title>A practical guide to cluster randomised trials in health services research</article-title><year>2012</year><comment><ext-link ext-link-type="uri" 
xlink:href="https://onlinelibrary.wiley.com/doi/book/10.1002/9781119966241">https://onlinelibrary.wiley.com/doi/book/10.1002/9781119966241</ext-link></comment><pub-id pub-id-type="doi">10.1002/9781119966241</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hayes</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Moulton</surname><given-names>LH</given-names> </name></person-group><source>Cluster Randomised Trials</source><year>2009</year><edition>1</edition><publisher-name>CRC Press</publisher-name><pub-id pub-id-type="doi">10.1201/9781584888178</pub-id><pub-id pub-id-type="other">9780429142055</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hayes</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Moulton</surname><given-names>LH</given-names> </name></person-group><source>Cluster Randomised Trials</source><year>2017</year><publisher-name>CRC Press</publisher-name><pub-id pub-id-type="other">9781315370286</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Murray</surname><given-names>DM</given-names> </name></person-group><source>Design and Analysis of Group-Randomized Trials</source><year>1998</year><publisher-name>Oxford University Press</publisher-name><fpage>467</fpage><pub-id pub-id-type="other">0-19-512036-1</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hussey</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Hughes</surname><given-names>JP</given-names> 
</name></person-group><article-title>Design and analysis of stepped wedge cluster randomized trials</article-title><source>Contemp Clin Trials</source><year>2007</year><month>02</month><volume>28</volume><issue>2</issue><fpage>182</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.1016/j.cct.2006.05.007</pub-id><pub-id pub-id-type="medline">16829207</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pals</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Alfano</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Shadish</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Hannan</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Baker</surname><given-names>WL</given-names> </name></person-group><article-title>Individually randomized group treatment trials: a critical appraisal of frequently used design and analytic approaches</article-title><source>Am J Public Health</source><year>2008</year><month>08</month><volume>98</volume><issue>8</issue><fpage>1418</fpage><lpage>1424</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2007.127027</pub-id><pub-id pub-id-type="medline">18556603</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moyer</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Li</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>AJ</given-names> </name><etal/></person-group><article-title>Evaluating analytic models for individually randomized group treatment trials with complex clustering in nested and crossed 
designs</article-title><source>Stat Med</source><year>2024</year><month>11</month><day>10</day><volume>43</volume><issue>25</issue><fpage>4796</fpage><lpage>4818</lpage><pub-id pub-id-type="doi">10.1002/sim.10206</pub-id><pub-id pub-id-type="medline">39225281</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Antonio Mouri&#x00F1;o Garc&#x00ED;a</surname><given-names>M</given-names> </name><name name-style="western"><surname>P&#x00E9;rez Rodr&#x00ED;guez</surname><given-names>R</given-names> </name><name name-style="western"><surname>Anido Rif&#x00F3;n</surname><given-names>L</given-names> </name></person-group><article-title>Leveraging Wikipedia knowledge to classify multilingual biomedical documents</article-title><source>Artif Intell Med</source><year>2018</year><month>06</month><volume>88</volume><fpage>37</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1016/j.artmed.2018.04.007</pub-id><pub-id pub-id-type="medline">29730047</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>AM</given-names> </name></person-group><article-title>An effective general purpose approach for automated biomedical document classification</article-title><source>AMIA Annu Symp Proc</source><year>2006</year><volume>2006</volume><issue>161-5</issue><fpage>161</fpage><lpage>165</lpage><pub-id pub-id-type="medline">17238323</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Noel-Storr</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Kuiper</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>Machine learning for identifying randomized controlled trials: an evaluation and practitioner&#x2019;s guide</article-title><source>Res Synth Methods</source><year>2018</year><month>12</month><volume>9</volume><issue>4</issue><fpage>602</fpage><lpage>614</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1287</pub-id><pub-id pub-id-type="medline">29314757</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Jaishi</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Taljaard</surname><given-names>M</given-names> </name><name name-style="western"><surname>Al-Jaishi</surname><given-names>MD</given-names> </name><etal/></person-group><article-title>Machine learning algorithms to identify cluster randomized trials from MEDLINE and EMBASE</article-title><source>Syst Rev</source><year>2022</year><month>10</month><day>25</day><volume>11</volume><issue>1</issue><fpage>229</fpage><pub-id pub-id-type="doi">10.1186/s13643-022-02082-4</pub-id><pub-id pub-id-type="medline">36284336</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers 
for language understanding</article-title><source>arXiv</source><access-date>2025-04-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1810.04805">https://arxiv.org/abs/1810.04805</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yag</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Carbonell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name><name name-style="western"><surname>Le</surname><given-names>Q</given-names> </name></person-group><article-title>XLNet: generalized autoregressive pretraining for language understanding</article-title><source>arXiv</source><access-date>2025-04-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1906.08237">https://arxiv.org/abs/1906.08237</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1906.08237</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> 
</name><etal/></person-group><article-title>RoBERTa: A robustly optimized BERT pretraining approach</article-title><source>arXiv</source><access-date>2025-04-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1907.11692">https://arxiv.org/abs/1907.11692</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yasunaga</surname><given-names>M</given-names> </name><name name-style="western"><surname>Leskovec</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name></person-group><article-title>LinkBERT: pretraining language models with document links</article-title><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.acl-long">https://aclanthology.org/2022.acl-long</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.551</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>Transfer learning in biomedical natural language processing: an evaluation of BERT and ELMo on ten benchmarking datasets</article-title><comment>Preprint posted online in 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/W19-5006">https://www.aclweb.org/anthology/W19-5006</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/W19-5006</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2022</year><month>01</month><day>31</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>K</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sanghvi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>M</given-names> </name></person-group><article-title>A comparative analysis of logistic regression, 
random forest and KNN models for the text classification</article-title><source>Augment Hum Res</source><year>2020</year><month>12</month><volume>5</volume><issue>1</issue><fpage>12</fpage><pub-id pub-id-type="doi">10.1007/s41133-020-00032-0</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>BioWordVec, improving biomedical word embeddings with subword information and MeSH</article-title><source>Sci Data</source><year>2019</year><month>05</month><day>10</day><volume>6</volume><issue>1</issue><fpage>52</fpage><pub-id pub-id-type="doi">10.1038/s41597-019-0055-0</pub-id><pub-id pub-id-type="medline">31076572</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sokolova</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lapalme</surname><given-names>G</given-names> </name></person-group><article-title>A systematic analysis of performance measures for classification tasks</article-title><source>Inf Process Manag</source><year>2009</year><month>07</month><volume>45</volume><issue>4</issue><fpage>427</fpage><lpage>437</lpage><pub-id pub-id-type="doi">10.1016/j.ipm.2009.03.002</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Christen</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hand</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Kirielle</surname><given-names>N</given-names> </name></person-group><article-title>A review of the F-measure: its history, properties, criticism, and alternatives</article-title><source>ACM Comput Surv</source><year>2024</year><month>03</month><day>31</day><volume>56</volume><issue>3</issue><fpage>1</fpage><lpage>24</lpage><pub-id pub-id-type="doi">10.1145/3606367</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kamel</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>AKC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>Cost-sensitive boosting for classification of imbalanced data</article-title><source>Pattern Recognit</source><year>2007</year><month>12</month><volume>40</volume><issue>12</issue><fpage>3358</fpage><lpage>3378</lpage><pub-id pub-id-type="doi">10.1016/j.patcog.2007.04.009</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>WP</given-names> </name></person-group><article-title>SMOTE: synthetic minority over-sampling 
technique</article-title><source>J Artif Intell Res</source><year>2002</year><volume>16</volume><fpage>321</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1613/jair.953</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hastie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tibshirani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>J</given-names> </name></person-group><source>The Elements of Statistical Learning: Data Mining, Inference, and Prediction</source><year>2009</year><edition>2</edition><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-0-387-84858-7</pub-id><pub-id pub-id-type="other">978-1-4899-0519-2</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Utilizing image and caption information for biomedical document classification</article-title><source>Bioinformatics</source><year>2021</year><month>07</month><day>12</day><volume>37</volume><issue>Suppl_1</issue><fpage>i468</fpage><lpage>i476</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btab331</pub-id><pub-id pub-id-type="medline">34252939</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kesiku</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Chaves-Villota</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Garcia-Zapirain</surname><given-names>B</given-names> </name></person-group><article-title>Natural language processing techniques for text classification of biomedical documents: a systematic review</article-title><source>Information</source><year>2022</year><volume>13</volume><issue>10</issue><fpage>499</fpage><pub-id pub-id-type="doi">10.3390/info13100499</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Syriopoulos</surname><given-names>PG</given-names> </name><name name-style="western"><surname>Andriopoulos</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Koutsomitropoulos</surname><given-names>DA</given-names> </name></person-group><article-title>Evaluation of language models for multilabel classification of biomedical texts</article-title><conf-name>Artificial Intelligence Applications and Innovations</conf-name><conf-date>Jun 21, 2024</conf-date><fpage>68</fpage><lpage>78</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-63211-2_6</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walwyn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>C</given-names> </name></person-group><article-title>Therapist variation within randomised trials of psychotherapy: implications for precision, internal and external validity</article-title><source>Stat Methods Med Res</source><year>2010</year><month>06</month><volume>19</volume><issue>3</issue><fpage>291</fpage><lpage>315</lpage><pub-id pub-id-type="doi">10.1177/0962280209105017</pub-id><pub-id pub-id-type="medline">19608603</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Python 
code</p><media xlink:href="medinform_v13i1e63267_app1.docx" xlink:title="DOCX File, 26 KB"/></supplementary-material></app-group></back></article>