<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e54653</article-id><article-id pub-id-type="doi">10.2196/54653</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Accelerating Evidence Synthesis in Observational Studies: Development of a Living Natural Language Processing&#x2013;Assisted Intelligent Systematic Literature Review System</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Manion</surname><given-names>Frank J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Du</surname><given-names>Jingcheng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Dong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>He</surname><given-names>Long</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lin</surname><given-names>Bin</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Jingqi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Siwei</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Eckels</surname><given-names>David</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cervenka</surname><given-names>Jan</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fiduccia</surname><given-names>Peter C</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cossrow</surname><given-names>Nicole</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yao</surname><given-names>Lixia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>IMO Health</institution>, <addr-line>9600 W Bryn Mawr Ave # 100</addr-line><addr-line>Rosemont</addr-line><addr-line>IL</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>Merck &#x0026; Co, Inc</institution>, <addr-line>126 East Lincoln 
Ave</addr-line><addr-line>Rahway</addr-line><addr-line>NJ</addr-line>, <country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Perrin</surname><given-names>Caroline</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Matsuda</surname><given-names>Shinichi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhou</surname><given-names>Sicheng</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Dong Wang, PhD, Merck &#x0026; Co, Inc, 126 East Lincoln Ave., Rahway, NJ, United States, 1 619-643-2693; <email>dong.wang10@merck.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>23</day><month>10</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e54653</elocation-id><history><date date-type="received"><day>17</day><month>11</month><year>2023</year></date><date date-type="rev-recd"><day>24</day><month>04</month><year>2024</year></date><date date-type="accepted"><day>23</day><month>07</month><year>2024</year></date></history><copyright-statement>&#x00A9; Frank J Manion, Jingcheng Du, Dong Wang, Long He, Bin Lin, Jingqi Wang, Siwei Wang, David Eckels, Jan Cervenka, Peter C Fiduccia, Nicole Cossrow, Lixia Yao. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 23.10.2024. 
</copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e54653"/><abstract><sec><title>Background</title><p>Systematic literature review (SLR), a robust method to identify and summarize evidence from published sources, is considered to be a complex, time-consuming, labor-intensive, and expensive task.</p></sec><sec><title>Objective</title><p>This study aimed to present a solution based on natural language processing (NLP) that accelerates and streamlines the SLR process for observational studies using real-world data.</p></sec><sec sec-type="methods"><title>Methods</title><p>We followed an agile software development and iterative software engineering methodology to build a customized intelligent end-to-end living NLP-assisted solution for observational SLR tasks. Multiple machine learning&#x2013;based NLP algorithms were adopted to automate article screening and data element extraction processes. The NLP prediction results can be further reviewed and verified by domain experts, following the human-in-the-loop design. 
The system integrates explainable artificial intelligence to provide evidence for NLP algorithms and add transparency to extracted literature data elements. The system was developed based on 3 existing SLR projects of observational studies, including the epidemiology studies of human papillomavirus&#x2013;associated diseases, the disease burden of pneumococcal diseases, and cost-effectiveness studies on pneumococcal vaccines.</p></sec><sec sec-type="results"><title>Results</title><p>Our Intelligent SLR Platform covers major SLR steps, including study protocol setting, literature retrieval, abstract screening, full-text screening, data element extraction from full-text articles, results summary, and data visualization. The NLP algorithms achieved accuracy scores of 0.86-0.90 on article screening tasks (framed as text classification tasks) and macroaverage F1 scores of 0.57-0.89 on data element extraction tasks (framed as named entity recognition tasks).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Cutting-edge NLP algorithms expedite SLR for observational studies, thus allowing scientists to have more time to focus on the quality of data and the synthesis of evidence in observational studies. 
Aligning with the living SLR concept, the system has the potential to update literature data and enable scientists to easily stay current with the literature related to observational studies prospectively and continuously.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>deep learning</kwd><kwd>natural language processing</kwd><kwd>systematic literature review</kwd><kwd>artificial intelligence</kwd><kwd>software development</kwd><kwd>data extraction</kwd><kwd>epidemiology</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Systematic literature reviews (SLRs) are widely recognized as a robust method to identify and summarize evidence from published sources [<xref ref-type="bibr" rid="ref1">1</xref>]. However, conducting an SLR can be a complex, time-consuming, labor-intensive, and expensive task, depending on the breadth of the topic, level of granularity, or resolution of the review needed [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. One recent study estimated the time and cost required to conduct an SLR can be as high as 1.72 person-years of scientist effort and approximately $140,000 per review [<xref ref-type="bibr" rid="ref4">4</xref>]. Because SLRs are so resource intensive, it is difficult to stay up to date, and once an SLR is complete and new literature is published, the SLR may become incomplete and obsolete as time goes by.</p><p>Natural language processing (NLP) refers to artificial intelligence (AI) technologies that can extract structured information from textual documents such as medical charts, lab results, and many other types of unstructured text. NLP has significantly advanced a variety of biomedical applications in recent years. 
There is considerable community interest in using AI such as machine learning (ML) and NLP to improve automation in aspects of literature reviews [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. For example, Thomas et al used NLP to identify randomized controlled trials for Cochrane reviews, and Wallace et al developed methods to extract sentences from literature related to clinical trial reports. There are also some SLR management software tools, such as Rayyan.ai [<xref ref-type="bibr" rid="ref8">8</xref>], which leverages NLP to expedite certain SLR steps, including article screening.</p><p>Despite these existing efforts, there is a lack of systematic and integrated NLP solutions for SLR to cover its full aspects, preventing the wide adoption of such tools in SLR projects.</p><p>Thus, in this study, we evaluated an intelligent SLR system (hereinafter referred to as ISLR) for observational SLR tasks. The use of NLP improves efficiency, while the human-in-the-loop approach improves accuracy and reduces errors. The system uses cutting-edge NLP tools that employ ML and deep learning (DL) approaches to expedite the time-consuming processes involved in an SLR by making a series of learned recommendations to the end user. The purpose of this study is to evaluate an AI tool that accelerates and streamlines the SLR process and to demonstrate the validity of this tool in 3 use cases.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Workflow and System Architecture</title><p>ISLR has 2 major views that target 2 types of users in the observational studies in an SLR lifecycle: (1) an intelligent SLR workbench for literature reviewers who conduct routine literature reviews, and (2) a living literature data dashboard for researchers and analysts who focus on analyzing SLR data and keep up to date on new evidence. 
<xref ref-type="fig" rid="figure1">Figure 1</xref> shows the overview architecture, including the 2 major views and data flow of the SLR system. ISLR integrates AI technologies and an SLR workflow management system to support literature collection, screening, and data extraction. The living literature dashboard continuously searches and updates the SLR, allowing users to interactively navigate the updated literature and develop new insights.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall data flow architecture of ISLR demonstrating the 2 major views. AI: artificial intelligence; ISLR: intelligent systematic literature review; SLR: systematic literature review.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig01.png"/></fig><p>Reliable NLP systems depend heavily on the development of a reasonable workflow, user interfaces, and high-performance NLP algorithms. To develop the system and define the system workflow and user interfaces, we collaborated with end users who are experts in SLR using an iterative approach that employed industry-standard agile methodology. The team identified 6 major functional areas that were essential for the application: (1) protocol specification assistance, (2) literature search and indexing, (3) abstract screening with NLP assistance, (4) support for full-text searching, uploading, and screening, (5) full-text data element extraction using NLP assistance to identify and extract relevant data elements from full-text and embedded tables, and (6) literature data visualization to enable users to assess the SLR results and perform data discovery. 
<xref ref-type="fig" rid="figure2">Figure 2</xref> shows the system workflow and the embedded NLP services to expedite two of the most time-consuming steps, which are article screening and data element extraction.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>ISLR workflow and embedded NLP engines. ISLR: intelligent systematic literature review; NLP: natural language processing.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig02.png"/></fig></sec><sec id="s2-2"><title>Development and Validation of NLP Algorithms</title><p>As mentioned earlier, 2 sets of NLP algorithms are required for a specific SLR project, including abstract screening and full-text data element extraction. <xref ref-type="fig" rid="figure3">Figure 3</xref> outlines the NLP algorithm development process for these 2 steps separately. For abstract screening, the first step is to annotate and build a corpus that includes the abstract text, citation metadata, and inclusion/exclusion status. Once the corpus is prepared, NLP algorithm training, evaluation, and selection can be performed, and the best-performing algorithms will be chosen for deployment.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>SLR NLP algorithm development steps. NLP: natural language processing; SLR: systematic literature review.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig03.png"/></fig><p>Similar to abstract screening, the NLP algorithm for the full-text data element extraction also requires a complete NLP development lifecycle. Unlike abstract screening, where labeled corpora may be available from previous SLR projects, data annotation is required to curate a labeled data set for training and evaluating NLP algorithms. The best-performing algorithms will be selected for deployment after evaluation. 
The following figure describes details of the NLP algorithm development and validation process for SLR projects.</p><p>Three previously completed SLRs were used to guide and validate NLP development. These 3 projects included: (1) the prevalence of human papillomavirus (HPV) detected in head and neck squamous cell carcinomas (referred to as <italic>HPV Prevalence</italic>); (2) the epidemiology of the pneumococcal disease (referred to as <italic>Pneumococcal Epidemiology</italic>), and (3) the economic burden of pneumococcal disease (referred to as <italic>Pneumococcal Economic Burden</italic>). The inclusion and exclusion criteria for these 3 SLRs can be found in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><sec id="s2-2-1"><title>Developing the Abstract Screening Corpora</title><p>Abstract screening was treated as a binary document classification task, ie, inclusion or exclusion of the article based on the abstract. Consequently, it was necessary to select and train NLP models for the task that demonstrated adequate performance and that had a reasonable computational time. The annotated screening literature sets from the 3 previous SLRs were used as the gold standard to train and validate models, including 1697, 207, and 421 articles for <italic>HPV Prevalence</italic>, <italic>Pneumococcal Epidemiology,</italic> and <italic>Pneumococcal Economic Burden</italic>, respectively. The corpora contained citation metadata, including title, authors, Medical Subject Heading terms [<xref ref-type="bibr" rid="ref9">9</xref>], and the text of the corresponding abstracts.</p></sec><sec id="s2-2-2"><title>Developing the Full-Text Data Element Extraction Corpora</title><p>We selected 190, 25, and 24 full-text articles for <italic>HPV Prevalence</italic>, <italic>Pneumococcal Epidemiology,</italic> and <italic>Pneumococcal Economic Burden</italic> for annotation, respectively. 
Based on the key outcome variables defined in the 3 SLRs, we annotated 12 types of data elements, covering information related to general observational studies, such as <italic>Study Population,</italic> to disease-specific information such as <italic>HPV Lab Technique</italic> and <italic>Pneumococcal Disease Type</italic>.</p></sec><sec id="s2-2-3"><title>Abstract Screening NLP Algorithms</title><p>For abstract screening, the NLP model classifies each article for its relevance based on its title, abstract, and other citation metadata. To build the abstract screening module, we evaluated 4 traditional ML-based document classification algorithms, XGBoost [<xref ref-type="bibr" rid="ref10">10</xref>], support vector machines [<xref ref-type="bibr" rid="ref11">11</xref>], logistic regression [<xref ref-type="bibr" rid="ref12">12</xref>], and random forest [<xref ref-type="bibr" rid="ref13">13</xref>] on the binary inclusion/exclusion classification task for abstract screening. The abstract screening corpora were used to evaluate NLP models by calculating standard metrics of <italic>precision (fraction of relevant instances among the retrieved instances, also called positive predictive value</italic>), <italic>recall (fraction of relevant instances that were retrieved, also called sensitivity</italic>), <italic>accuracy</italic>, and <italic>F1 scores</italic> (the harmonic mean of precision and recall). The full features include title, abstract, authors, keywords, journal, Medical Subject Heading terms, and publication types. 
We concatenated all features and extracted the term frequency-inverse document frequency vector as feature representation.</p></sec><sec id="s2-2-4"><title>Data Element Extraction NLP Algorithms</title><p>To construct the module for data element extraction, we treated the problem of data element recognition and extraction as a named entity recognition (NER) problem, which aims to recognize the mentions of entities from the text [<xref ref-type="bibr" rid="ref14">14</xref>]. We evaluated a series of NLP algorithms consisting of ML and DL algorithms to recognize and extract data elements from full text, including (1) conditional random fields (CRFs), a classic statistical sequence modeling algorithm that has been widely applied to NER tasks [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]; (2) long short-term memory (LSTM), a variation of recurrent neural networks that has achieved remarkable success in NER tasks [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]; and (3) &#x201C;Clinical BERT (Bidirectional Encoder Representations from Transformers)&#x201D; [<xref ref-type="bibr" rid="ref19">19</xref>], a novel transformer-based DL model. 
Standard metrics, including <italic>precision</italic>, <italic>recall</italic>, <italic>accuracy</italic>, and <italic>F1 scores</italic>, were calculated.</p></sec></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This is not applicable as this study is not human subjects research.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Here, we report the results of the construction of the annotation corpora and the results of the NLP algorithm for abstract screening and data element extraction, respectively.</p><sec id="s3-1"><title>Abstract Screening Corpora Description</title><p>The <italic>HPV Prevalence</italic> corpus we constructed from the existing SLR project contained 1697 total citations, of which 538 were included, and 1159 were excluded due to study criteria. The constructed <italic>Pneumococcal Epidemiology</italic> corpus contained 207 citations, of which 85 were included and 122 were excluded. The constructed <italic>Pneumococcal Economic Burden</italic> corpus contained 421 citations, of which 79 were included, and 342 were excluded.</p></sec><sec id="s3-2"><title>Abstract Screening NLP Evaluation Results</title><p>Extensive studies have shown the superiority of transformer-based DL models for many NLP tasks [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Based on our experiments, however, adding features to the pretrained language models did not significantly boost their performance. The performance comparison results for each task are shown in <xref ref-type="table" rid="table1">Table 1</xref>. XGBoost achieved the highest accuracy on the <italic>HPV Prevalence</italic> and <italic>Pneumococcal Economic Burden</italic> tasks, while a support vector machine achieved the highest accuracy on the <italic>Pneumococcal Epidemiology</italic> task. 
XGBoost was ultimately chosen for deployment due to its better generalizability.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of article screening natural language processing model performance.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Task and algorithm</td><td align="left" valign="bottom">F1 score</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Accuracy</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2"><italic><bold>HPV Prevalence</bold></italic> <bold>(n=1697)</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">XGBoost</td><td align="left" valign="top">0.808</td><td align="left" valign="top">0.769</td><td align="left" valign="top">0.851</td><td align="left" valign="top">0.888</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Support vector machine</td><td align="left" valign="top">0.727</td><td align="left" valign="top">0.781</td><td align="left" valign="top">0.681</td><td align="left" valign="top">0.859</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Logistic regression</td><td align="left" valign="top">0.684</td><td align="left" valign="top">0.897</td><td align="left" valign="top">0.553</td><td align="left" valign="top">0.859</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Random forest</td><td align="left" valign="top">0.523</td><td align="left" valign="top">0.944</td><td align="left" valign="top">0.362</td><td align="left" valign="top">0.818</td></tr><tr><td align="left" valign="top" colspan="2"><italic><bold>Pneumococcal Economic Burden</bold></italic> <bold>(n=421)</bold></td><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">XGBoost</td><td align="left" valign="top">0.750</td><td align="left" valign="top">0.857</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.907</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Support vector machine</td><td align="left" valign="top">0.533</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.444</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Logistic regression</td><td align="left" valign="top">0.333</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.222</td><td align="left" valign="top">0.831</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Random forest</td><td align="left" valign="top">0.429</td><td align="left" valign="top">0.600</td><td align="left" valign="top">0.333</td><td align="left" valign="top">0.814</td></tr><tr><td align="left" valign="top" colspan="2"><italic><bold>Pneumococcal Epidemiology</bold></italic> <bold>(n=207)</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">XGBoost</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.533</td><td align="left" valign="top">0.889</td><td align="left" valign="top">0.619</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Support vector machine</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.861</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Logistic regression</td><td align="left" valign="top">0.429</td><td align="left" 
valign="top">0.600</td><td align="left" valign="top">0.333</td><td align="left" valign="top">0.619</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Random forest</td><td align="left" valign="top">0.615</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.444</td><td align="left" valign="top">0.762</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Full-Text Data Element Extraction Corpora Description</title><p>The human annotators annotated 190, 25, and 24 full-text articles for the <italic>HPV Prevalence</italic>, <italic>Pneumococcal Epidemiology</italic>, and <italic>Pneumococcal Economic Burden</italic> tasks, respectively. Among these full-text articles, 4498, 579, and 252 entity mentions were annotated for 3 projects, respectively. However, the distribution of annotated entities is highly imbalanced. For example, data elements like <italic>HPV Lab Technique</italic> and <italic>HPV Sample Type</italic> were very prevalent, but data elements like <italic>Maximum/Minimum Age in Study Cohort</italic> were rarely annotated in the corpora.</p></sec><sec id="s3-4"><title>Results of the Full-Text Screening and Data Element Extraction NLP Methods</title><p><xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref> show the comparison of NLP performance among CRFs, LSTM, and BERT on the 3 corpora. For each of the 3 corpora used to train the NLP models, LSTM demonstrated superiority over the conventional ML algorithm (ie, CRF) on entity recognition. Among DL models, we did not observe significant improvement in F1 scores by use of the BERT model. The BERT model achieved similar or worse performance on most data elements. The performance across different tasks varies, primarily due to the availability of annotated data. 
For example, on average, models&#x2019; performance on <italic>HPV Prevalence</italic> is higher than <italic>Pneumococcal Epidemiology</italic> and <italic>Pneumococcal Economic Burden</italic>, as <italic>HPV Prevalence</italic> has the largest annotated data. Due to the highly imbalanced distribution of annotated entities, we observe a variation in performance across different data elements for the same model. For example, in the <italic>Pneumococcal Epidemiology</italic> task, the LSTM model has achieved 0.412 in the identification of the <italic>Study Cohort</italic> and 0.768 in the identification of the <italic>Pneumococcal Disease Type</italic>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Overall performance comparison for the named entity recognition task in the 3 natural language processing training corpora. Scores averaged across all 12 extracted data elements. Measured in lenient F1 score.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measure</td><td align="left" valign="bottom" colspan="3"><italic>HPV Prevalence</italic></td><td align="left" valign="bottom" colspan="3"><italic>Pneumococcal Epidemiology</italic></td><td align="left" valign="bottom" colspan="3"><italic>Pneumococcal Economic Burden</italic></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">CRF<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">LSTM<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">Clinical BERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="bottom">CRF</td><td align="left" valign="bottom">LSTM</td><td align="left" valign="bottom">Clinical BERT</td><td align="left" valign="bottom">CRF</td><td align="left" valign="bottom">LSTM</td><td align="left" valign="bottom">Clinical BERT</td></tr></thead><tbody><tr><td 
align="left" valign="top">Microaverage (global average that uses the total number of true positives, false positives, and false negatives)</td><td align="left" valign="top">0.856</td><td align="left" valign="top">0.890</td><td align="left" valign="top">0.782</td><td align="left" valign="top">0.571</td><td align="left" valign="top">0.646</td><td align="left" valign="top">0.444</td><td align="left" valign="top">0.609</td><td align="left" valign="top">0.615</td><td align="left" valign="top">0.478</td></tr><tr><td align="left" valign="top">Macroaverage score (arithmetic mean of all the per-entity type scores)</td><td align="left" valign="top">0.522</td><td align="left" valign="top">0.674</td><td align="left" valign="top">0.685</td><td align="left" valign="top">0.270</td><td align="left" valign="top">0.295</td><td align="left" valign="top">0.227</td><td align="left" valign="top">0.216</td><td align="left" valign="top">0.238</td><td align="left" valign="top">0.231</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CRF: conditional random field.</p></fn><fn id="table2fn2"><p><sup>b</sup>LSTM: long short-term memory.</p></fn><fn id="table2fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance comparison for the named entity recognition task on selected data elements. 
Measured in lenient F1 score.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measure</td><td align="left" valign="bottom" colspan="3"><italic>HPV Prevalence</italic></td><td align="left" valign="bottom" colspan="3"><italic>Pneumococcal Epidemiology</italic></td><td align="left" valign="bottom" colspan="3"><italic>Pneumococcal Economic Burden</italic></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">CRF<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">LSTM<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">Clinical BERT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">CRF</td><td align="left" valign="bottom">LSTM</td><td align="left" valign="bottom">Clinical BERT</td><td align="left" valign="bottom">CRF</td><td align="left" valign="bottom">LSTM</td><td align="left" valign="bottom">Clinical BERT</td></tr></thead><tbody><tr><td align="left" valign="top"><italic>Study Cohort</italic></td><td align="left" valign="top">0.482</td><td align="left" valign="top">0.695</td><td align="left" valign="top">0.727</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">0.412</td><td align="left" valign="top">0.278</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><italic>Study Location</italic></td><td align="left" valign="top">0.434</td><td align="left" valign="top">0.520</td><td align="left" valign="top">0.574</td><td align="left" valign="top">0.514</td><td align="left" valign="top">0.508</td><td align="left" valign="top">0.546</td><td align="left" valign="top">0.586</td><td align="left" valign="top">0.484</td><td align="left" 
valign="top">0.497</td></tr><tr><td align="left" valign="top"><italic>Study Type</italic></td><td align="left" valign="top">0.733</td><td align="left" valign="top">0.760</td><td align="left" valign="top">0.753</td><td align="left" valign="top">0.364</td><td align="left" valign="top">0.525</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.328</td><td align="left" valign="top">0.299</td></tr><tr><td align="left" valign="top"><italic>Pneumococcal Disease Type</italic></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.725</td><td align="left" valign="top">0.768</td><td align="left" valign="top">0.526</td><td align="left" valign="top">0.644</td><td align="left" valign="top">0.715</td><td align="left" valign="top">0.523</td></tr><tr><td align="left" valign="top"><italic>Incidence or Prevalence</italic></td><td align="left" valign="top">0.986</td><td align="left" valign="top">0.983</td><td align="left" valign="top">0.924</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><italic>Study Time</italic></td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.888</td><td align="left" valign="top">0.930</td><td align="left" valign="top">0.222</td><td align="left" valign="top">0.636</td><td align="left" valign="top">0.328</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>CRF: conditional random field.</p></fn><fn id="table3fn2"><p><sup>b</sup>LSTM: long short-term 
memory.</p></fn><fn id="table3fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table3fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Final NLP Algorithm Selection</title><p>NLP algorithms were needed for the 2 tasks, abstract screening, and data element extraction, in the ISLR system. The abstract screening was treated as a classification task. Based on our experimental results, XGBoost was selected for this task due to good performance on our document classification experiments and less computational complexity than DL-based models. For the data element extraction task, LSTM was selected over CRF and BERT for the same reasons.</p></sec><sec id="s3-6"><title>ISLR System Components</title><sec id="s3-6-1"><title>Study Protocol Specification</title><p>Study protocol specification is one of the first steps in an SLR project. Users can upload a PDF document to the system that describes the SLR study protocol for reference. The SLR system has a default list of data elements with their descriptions and answer types (eg, free text, multiple choice, and checkbox), which will be extracted from full-text PDFs of articles. The system also allows users to create and modify the list. At the end of the project, all the extracted data elements can be exported in a structured format.</p></sec><sec id="s3-6-2"><title>Literature Search</title><p>The ISLR system is integrated with the PubMed E-utilities application programming interface, which enables users to perform direct searches on PubMed. Citation metadata such as abstracts, titles, journals, and authors can be retrieved from PubMed and indexed in the system for further screening and data element extraction. 
Additionally, the system provides an option for users to retrieve this citation metadata by uploading a list of individual PubMed IDs.</p></sec><sec id="s3-6-3"><title>Abstract Screening</title><p>The purpose of abstract screening is to review collected articles&#x2019; relevance based on their title, abstract, and other relevant metadata, such as journal names, article types, and keywords. The relevant articles will be included for the following full-text screening and data element extraction steps. NLP services are provided at this step to make recommendations on whether a particular article should be included for full-text review. The supporting information (eg, salient words that are impactful to inclusion and exclusion) for the NLP recommendation will also be shown to provide explainable evidence. Human experts can further review the predictions for each article and decide on abstract screening status (keep or exclude). <xref ref-type="fig" rid="figure4">Figure 4</xref> shows the abstract screening interface demonstrating prediction results and relevant terms discovered by the NLP algorithms.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Abstract screening interface. Terms that support inclusion in the finalized cohort of relevant articles are shown in green, while terms that detract from inclusion are shown in red. The scale of the colors shows how significantly one term can impact prediction decisions (eg, darker color indicates higher impact).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig04.png"/></fig></sec><sec id="s3-6-4"><title>Full-Text Searching, Uploading, and Screening</title><p>This step aims to identify full-text PDF documents for each included article and further screen their relevance based on the SLR study protocol. 
Only the articles that are deemed relevant after this stage will be included in the final full-text data element extraction step. The process of locating full-text PDF documents for each article can be time-consuming. The ISLR system integrates with PubMed Central to automatically find and collect full-text PDFs if they are publicly available. However, for articles whose full-text PDFs are not publicly available, users need to manually locate the articles through publishers and upload the corresponding PDFs to the system through the provided user interface.</p></sec><sec id="s3-6-5"><title>Full-Text Data Element Extraction</title><p>Extracting full-text data elements is a time-consuming process in SLR projects. It requires reviewing the full-text article and extracting multiple relevant pieces of information defined in the study protocol. These data elements are often found in various sections of an article, including tables. The ISLR system uses Amazon Textract [<xref ref-type="bibr" rid="ref24">24</xref>] for optical character recognition to extract text and tables from PDF files, followed by NLP services to further extract information from both text and tables. The NLP services can recommend potential answers for each data element, and human experts can review, select, and modify the extracted information. <xref ref-type="fig" rid="figure5">Figure 5</xref> shows a screenshot of the user interface for this step.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Full-text data element extraction user interface. Data elements from the article extracted by the NLP algorithms are color-coded and highlighted in the PDF. Highlight colors in the PDF text are linked to the data elements as shown in the right-hand frame. For the data element list on the right side, all the extracted data elements can pop up as candidates for the users to choose from. 
NLP: natural language processing.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig05.png"/></fig></sec><sec id="s3-6-6"><title>Data Summary and Visualization</title><p>The ISLR system offers interactive dashboards to end users, such as researchers, for exploring the SLR results and data. These dashboards allow users to apply data filters, such as study location and cohort size, to refine their search results. For each data element extracted from full-text articles, users can click on the element to navigate to the corresponding article, ensuring traceability and appropriate references to source documents in the SLR project. Additionally, the dashboards recommend recent relevant articles and suggest articles that may require full-text screening. <xref ref-type="fig" rid="figure6">Figure 6</xref> displays the major functions and screenshots of the dashboard.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Interactive visualization of existing SLR data, lists of relevant publications, and data exportation control. SLR: systematic literature review.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54653_fig06.png"/></fig></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>As described in the introduction, conducting an SLR is complex and expensive. There is also a rapid growth of the available number of publications and other data, such as clinical trial reports used in the article search and screening processes, with an average annual growth rate for the life sciences of around 5% [25]. 
Consequently, there is considerable community interest in applying various types of automation, including AI, DL, and NLP, to the multiple tasks required for producing an accurate SLR [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>An important consideration for using the results of an SLR is how often the SLR is updated and hence how timely and complete these data are with respect to the real-world evidence. The &#x201C;living&#x201D; ISLR system addresses the difficulty of updating an SLR by providing an automated workflow including review tools to detect when new data are available and to trigger at least a semi-automated update process for the expedited review. The system is also expandable to cover additional data elements of interest by updating existing NLP pipelines.</p><p>The major accomplishments of this ISLR system include improving the time, efficiency, cost, completeness of evidence, and error avoidance through techniques to assist researchers with decision-making (so-called human-in-the-loop). The ISLR system is aligned with the living SLR concept, as it supports a rapid update of existing literature data. Additionally, since the classification and data element extraction tasks are maintained by the system, results can be used for retraining the classification and NLP algorithms on a routine basis. Consequently, the performance of the system should improve over time.</p><p>The focus of this work was to evaluate an intelligent system that includes all major steps of an SLR with humans in the loop. The corpora evaluated in this study mostly focus on health economics and outcomes research in specific therapeutic areas. The generalizability of the learning algorithms to another domain will benefit from further formal examination. 
Since we have not yet conducted a time analysis of an SLR study conducted both manually and with this tool, we are unable to precisely quantify the time savings from the ISLR system. In addition, our NLP technologies are limited to the extraction of relevant information directly from the text but are not able to conduct reasoning with long context to support complex data element extraction, such as GRADE (Grading of Recommendations, Assessment, Development, and Evaluation) or RoB2 (Risk of Bias 2). The recent advances in large language models, such as generative pretrained transformer 4, bring NLP technologies to expert-level performance on various professional and academic benchmarks. Given their high performance, generalizability, and reasoning capacity, it would be interesting to further assess the efficacy and accuracy of large language models in various SLR tasks and complex data element extraction.</p><p>As an early and innovative attempt to automate the SLR lifecycle through NLP technologies, ISLR does not fully support PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) reporting yet. We plan to continuously iterate ISLR to cover the PRISMA checklist and report generation in the future. In addition, we have not yet conducted formal usability studies of the user interface, although agile methods involving iterative refinement of the interface through input from domain experts in SLR were employed throughout the software development process.</p></sec><sec id="s4-2"><title>Conclusions</title><p>Our ISLR system is a user-centered, end-to-end intelligent solution to automate and accelerate the SLR process and supports &#x201C;living&#x201D; SLRs with humans in the loop. The system integrates cutting-edge ML- and DL-based NLP algorithms to make recommendations on article screening and data element extraction, which allow the system to prospectively and continuously update relevant literature in a timely fashion. 
This allows scientists to have more time to focus on the quality of data and the synthesis of evidence and to stay current with literature related to observational studies.</p></sec></sec></body><back><ack><p>This research was supported by Merck Sharp &#x0026; Dohme LLC, a subsidiary of Merck &#x0026; Co, Inc, Rahway, NJ.</p></ack><notes><sec><title>Disclaimer</title><p>The content is the sole responsibility of the authors and does not necessarily represent the official views of Merck &#x0026; Co, Inc, Rahway, NJ, or Melax Tech.</p></sec><sec><title>Data Availability</title><p>The annotated corpora underlying this article are available on GitHub [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Study concept and design: JD and LY. Corpus preparation: DW, JD, and LY. Experiments: JD and BL. Draft of the manuscript: FJM, JD, DW, NC, and LY. Acquisition, analysis, or interpretation of data: JD, DW, NC, and LY. Critical revision of the manuscript for important intellectual content: all authors. Study supervision: JD, LY, and NC.</p></fn><fn fn-type="conflict"><p>DW, JC, DE, NC, PCF, and LY are employees of Merck Sharp &#x0026; Dohme LLC, a subsidiary of Merck &#x0026; Co., Inc., Rahway, NJ, USA. 
JD, BL, SW, XW, LH, JW, and FJM are employees of IMO.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">CRF</term><def><p>conditional random field</p></def></def-item><def-item><term id="abb4">DL</term><def><p>deep learning</p></def></def-item><def-item><term id="abb5">GRADE</term><def><p>Grading of Recommendations, Assessment, Development, and Evaluation</p></def></def-item><def-item><term id="abb6">HPV</term><def><p>human papillomavirus</p></def></def-item><def-item><term id="abb7">ISLR</term><def><p>intelligent systematic literature review</p></def></def-item><def-item><term id="abb8">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb9">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb10">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb11">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb12">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb13">RoB2</term><def><p>Risk of Bias 2</p></def></def-item><def-item><term id="abb14">SLR</term><def><p>systematic literature review</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Munn</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Stern</surname><given-names>C</given-names> </name><name name-style="western"><surname>Aromataris</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lockwood</surname><given-names>C</given-names> 
</name><name name-style="western"><surname>Jordan</surname><given-names>Z</given-names> </name></person-group><article-title>What kind of systematic review should I conduct? A proposed typology and guidance for systematic reviewers in the medical and health sciences</article-title><source>BMC Med Res Methodol</source><year>2018</year><month>01</month><day>10</day><volume>18</volume><issue>1</issue><fpage>5</fpage><pub-id pub-id-type="doi">10.1186/s12874-017-0468-4</pub-id><pub-id pub-id-type="medline">29316881</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsafnat</surname><given-names>G</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Choong</surname><given-names>MK</given-names> </name></person-group><article-title>Systematic review automation technologies</article-title><source>Syst Rev</source><year>2014</year><volume>3</volume><issue>74</issue><comment><ext-link ext-link-type="uri" xlink:href="https://link.springer.com/article/10.1186/2046-4053-3-74">https://link.springer.com/article/10.1186/2046-4053-3-74</ext-link></comment><pub-id pub-id-type="doi">10.1186/2046-4053-3-74</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Higgins</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name></person-group><source>Cochrane Handbook for Systematic Reviews of Interventions, Version 65</source><year>2024</year><access-date>2024-10-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://training.cochrane.org/handbook/current">https://training.cochrane.org/handbook/current</ext-link></comment></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michelson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Reuter</surname><given-names>K</given-names> </name></person-group><article-title>The significant cost of systematic reviews and meta-analyses: a call for greater involvement of machine learning to assess the promise of clinical trials</article-title><source>Contemp Clin Trials Commun</source><year>2019</year><month>12</month><volume>16</volume><fpage>100443</fpage><pub-id pub-id-type="doi">10.1016/j.conctc.2019.100443</pub-id><pub-id pub-id-type="medline">31497675</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michelson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>M</given-names> </name><name name-style="western"><surname>Minton</surname><given-names>S</given-names> </name></person-group><article-title>AI2 leveraging machine-assistance to replicate a systematic review</article-title><source>V H</source><year>2019</year><month>05</month><volume>22</volume><fpage>S34</fpage><pub-id pub-id-type="doi">10.1016/j.jval.2019.04.006</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Del Fiol</surname><given-names>G</given-names> </name><name name-style="western"><surname>Michelson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Iorio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cotoi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Haynes</surname><given-names>RB</given-names> </name></person-group><article-title>A deep learning method 
to automatically identify reports of scientifically rigorous clinical research from the biomedical literature: comparative analytic study</article-title><source>J Med Internet Res</source><year>2018</year><month>06</month><day>25</day><volume>20</volume><issue>6</issue><fpage>e10281</fpage><pub-id pub-id-type="doi">10.2196/10281</pub-id><pub-id pub-id-type="medline">29941415</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elliott</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Turner</surname><given-names>T</given-names> </name><name name-style="western"><surname>Clavisi</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Living systematic reviews: an emerging opportunity to narrow the evidence-practice gap</article-title><source>PLoS Med</source><year>2014</year><month>02</month><volume>11</volume><issue>2</issue><fpage>e1001603</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1001603</pub-id><pub-id pub-id-type="medline">24558353</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Rayyan - Intelligent systematic review</article-title><source>Rayyan</source><year>2021</year><access-date>2024-04-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.rayyan.ai/">https://www.rayyan.ai/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Medical Subject Headings</article-title><source>National Library of Medicine</source><year>2024</year><access-date>2022-05-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nlm.nih.gov/mesh/meshhome.html">https://www.nlm.nih.gov/mesh/meshhome.html</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>KDD &#x2019;16: The 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noble</surname><given-names>WS</given-names> </name></person-group><article-title>What is a support vector machine?</article-title><source>Nat Biotechnol</source><year>2006</year><month>12</month><volume>24</volume><issue>12</issue><fpage>1565</fpage><lpage>1567</lpage><pub-id pub-id-type="doi">10.1038/nbt1206-1565</pub-id><pub-id pub-id-type="medline">17160063</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kleinbaum</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>M</given-names> </name></person-group><source>Logistic Regression: A Self-Learning Text</source><year>2010</year><access-date>2022-05-30</access-date><publisher-name>Springer</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://link.springer.com/book/10.1007/978-1-4419-1742-3">https://link.springer.com/book/10.1007/978-1-4419-1742-3</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Pal</surname><given-names>M</given-names> </name></person-group><article-title>Random forest classifier for remote sensing classification</article-title><source>Int J Remote Sens</source><year>2005</year><volume>26</volume><issue>1</issue><fpage>217</fpage><lpage>222</lpage><pub-id pub-id-type="doi">10.1080/01431160412331269698</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nadeau</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sekine</surname><given-names>S</given-names> </name></person-group><article-title>A survey of named entity recognition and classification</article-title><source>Lingvist Investig</source><year>2007</year><month>08</month><day>15</day><volume>30</volume><issue>1</issue><fpage>3</fpage><lpage>26</lpage><pub-id pub-id-type="doi">10.1075/li.30.1.03nad</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lafferty</surname><given-names>J</given-names> </name><name name-style="western"><surname>McCallum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pereira</surname><given-names>F</given-names> </name></person-group><article-title>Conditional random fields: probabilistic models for segmenting and labeling sequence data</article-title><year>2001</year><conf-name>ICML &#x2019;01: Proceedings of the Eighteenth International Conference on Machine Learning</conf-name><conf-date>Jun 28 to Jul 1, 2001</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>282</fpage><lpage>289</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://www.cs.columbia.edu/~jebara/6772/papers/crf.pdf">http://www.cs.columbia.edu/~jebara/6772/papers/crf.pdf</ext-link></comment></nlm-citation></ref><ref 
id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Pradhan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Extracting formulaic and free text clinical research articles metadata using conditional random fields</article-title><source>Proceedings of the NAACL HLT 2010 Second Louhi Workshop on Text and Data Mining of Health Documents</source><year>2010</year><access-date>2022-08-07</access-date><publisher-name>Association for Computational Linguistics</publisher-name><fpage>90</fpage><lpage>95</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W10-1114">https://aclanthology.org/W10-1114</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chiu</surname><given-names>JPC</given-names> </name><name name-style="western"><surname>Nichols</surname><given-names>E</given-names> </name></person-group><article-title>Named entity recognition with bidirectional LSTM-CNNs</article-title><source>arXiv</source><access-date>2024-10-17</access-date><comment>Preprint posted online on  Nov 26, 2015</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1511.08308">https://arxiv.org/abs/1511.08308</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Lample</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ballesteros</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Subramanian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kawakami</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dyer</surname><given-names>C</given-names> </name></person-group><article-title>Neural architectures for named entity recognition</article-title><source>arXiv</source><access-date>2024-10-17</access-date><comment>Preprint posted online on  Mar 4, 2016</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1603.01360">https://arxiv.org/abs/1603.01360</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Boag</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Publicly available clinical BERT embeddings</article-title><source>arXiv</source><access-date>2024-10-17</access-date><comment>Preprint posted online on  Apr 6, 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1904.03323">https://arxiv.org/abs/1904.03323</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/W19-1909</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><etal/></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language 
understanding</article-title><source>arXiv</source><access-date>2024-10-17</access-date><comment>Preprint posted online on  Oct 11, 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1810.04805">https://arxiv.org/abs/1810.04805</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://academic.oup.com/bioinformatics/article/36/4/1234/5566506">https://academic.oup.com/bioinformatics/article/36/4/1234/5566506</ext-link></comment><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2022</year><month>01</month><day>31</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id 
pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><name name-style="western"><surname>Allot</surname><given-names>A</given-names> </name><etal/></person-group><article-title>LitMC-BERT: transformer-based multi-label classification of biomedical literature with an application on COVID-19 literature curation</article-title><source>arXiv</source><access-date>2024-10-17</access-date><comment>Preprint posted online on  Apr 19, 2022</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2204.08649">https://arxiv.org/abs/2204.08649</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Amazon Textract</article-title><source>Amazon Web Services</source><access-date>2022-08-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/textract/">https://aws.amazon.com/textract/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Merck/NLP-SLR-corpora</article-title><source>GitHub</source><access-date>2024-10-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/Merck/NLP-SLR-corpora">https://github.com/Merck/NLP-SLR-corpora</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Inclusion and exclusion criteria for 3 systematic literature review projects.</p><media xlink:href="medinform_v12i1e54653_app1.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material></app-group></back></article>