<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e64682</article-id><article-id pub-id-type="doi">10.2196/64682</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>GPT-3.5 Turbo and GPT-4 Turbo in Title and Abstract Screening for Systematic Reviews</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Oami</surname><given-names>Takehiko</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Okada</surname><given-names>Yohei</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nakada</surname><given-names>Taka-aki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Emergency and Critical Care Medicine, Chiba University Graduate School of 
Medicine</institution><addr-line>1-8-1 Inohana, Chuo</addr-line><addr-line>Chiba</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Preventive Services, Kyoto University Graduate School of Medicine</institution><addr-line>Kyoto</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Health Services and Systems Research, Duke-NUS Medical School, National University of Singapore</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Shung</surname><given-names>Dennis</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Subhas</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Takehiko Oami, MD, PhD, Department of Emergency and Critical Care Medicine, Chiba University Graduate School of Medicine, 1-8-1 Inohana, Chuo, Chiba, 260-8677, Japan, 81 432262372; <email>seveneleven711thanks39@msn.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>12</day><month>3</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e64682</elocation-id><history><date date-type="received"><day>23</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>15</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>28</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Takehiko Oami, Yohei Okada, Taka-aki Nakada. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 12.3.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e64682"/><abstract><p>This study demonstrated that while GPT-4 Turbo had superior specificity when compared to GPT-3.5 Turbo (0.98 vs 0.51), as well as comparable sensitivity (0.85 vs 0.83), GPT-3.5 Turbo processed 100 studies faster (0.9 min vs 1.6 min) in citation screening for systematic reviews, suggesting that GPT-4 Turbo may be more suitable due to its higher specificity and highlighting the potential of large language models in optimizing literature selection.</p></abstract><kwd-group><kwd>large language models</kwd><kwd>citation screening</kwd><kwd>systematic review</kwd><kwd>clinical practice guidelines</kwd><kwd>artificial intelligence</kwd><kwd>sepsis</kwd><kwd>AI</kwd><kwd>review</kwd><kwd>GPT</kwd><kwd>screening</kwd><kwd>citations</kwd><kwd>critical care</kwd><kwd>Japan</kwd><kwd>Japanese</kwd><kwd>accuracy</kwd><kwd>efficiency</kwd><kwd>reliability</kwd><kwd>LLM</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Systematic reviews are essential in guideline development. 
Manual citation screening, however, is a time-consuming and labor-intensive process, often resulting in human errors and increased workloads [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Large language models (LLMs) have demonstrated the ability to comprehend and process natural language, underscoring their utility in medical applications [<xref ref-type="bibr" rid="ref3">3</xref>]. Consequently, LLMs have emerged as promising tools for citation screening in systematic reviews [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>LLMs, including GPT, Gemini, and Claude, could serve as secondary reviewers in title and abstract screening, with the downsides of needing to reconcile false positives and potentially missing some relevant citations [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Although more advanced LLMs are expected to outperform previous models in sensitivity, specificity, and efficiency [<xref ref-type="bibr" rid="ref9">9</xref>], the full impact of model development in citation screening remains to be fully understood.</p><p>This study aimed to compare accuracy and efficiency between GPT-3.5 Turbo and GPT-4 Turbo (OpenAI)&#x2014;widely used LLMs in the medical field&#x2014;in title and abstract screening.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>We conducted a post hoc analysis of our previous study to evaluate the performance of GPT-3.5 Turbo and GPT-4 Turbo in LLM-assisted title and abstract screening, using data from 5 clinical questions (CQs) developed for the Japanese Clinical Practice Guidelines for Management of Sepsis and Septic Shock 2024 [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. 
The two models determined the relevance of each reference based on patient characteristics, interventions, comparisons, and study designs specific to the selected CQs (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). LLM-assisted screening was conducted by using Python (v3.9.0) and the OpenAI application programming interface. The same prompt&#x2014;optimized to increase sensitivity from our previous study&#x2014;was applied to both models (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Evaluation metrics were expressed as sensitivity and specificity with 95% CIs, using the final list of included studies in the conventional review as the reference standard. These measures were aggregated to estimate the pooled sensitivity and specificity of LLM-assisted procedures. Additionally, we measured the time taken by each model to screen 100 studies. Further analysis details are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. LLM-assisted citation screening was conducted between June 6 and 7, 2024. STARD (Standards for Reporting of Diagnostic Accuracy) guidelines were followed.</p></sec><sec id="s3" sec-type="results"><title>Results</title><p>In the conventional citation screening process, 0.24% (41/16,669) of citations for 5 CQs were selected during the full-text evaluation. GPT-3.5 Turbo exhibited a combined sensitivity and specificity of 0.83 (95% CI 0.67&#x2010;0.92) and 0.51 (95% CI 0.39&#x2010;0.63), respectively (<xref ref-type="fig" rid="figure1">Figure 1</xref>). In contrast, GPT-4 Turbo demonstrated greater performance, with a sensitivity and specificity of 0.85 (95% CI 0.63&#x2010;0.95) and 0.98 (95% CI 0.97&#x2010;0.99), respectively (<xref ref-type="fig" rid="figure1">Figure 1</xref>, Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
A significant difference was found in specificity between both models (median difference 0.48, 95% CI 0.29 to 0.62) but not in sensitivity (median difference &#x2212;0.06, 95% CI &#x2212;0.50 to 0.23; Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). GPT-3.5 Turbo processed 100 studies faster than GPT-4 Turbo (0.9 min vs 1.6 min, respectively; mean difference 0.69, 95% CI 0.53-0.86 min; <xref ref-type="fig" rid="figure2">Figure 2</xref>, Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Comparison of GPT-3.5 Turbo&#x2019;s and GPT-4 Turbo&#x2019;s accuracy in citation screening. The results of the included publications were qualitatively analyzed, using the conventional method as the standard reference. The individual sensitivity and specificity for each CQ and the integrated sensitivities and specificities across CQs 1 to 5 were compared between GPT-3.5 Turbo (A and B) and GPT-4 Turbo (C and D), with 95% CIs and inconsistency values (<italic>I</italic><sup>2</sup>). CQ: clinical question.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e64682_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison of citation screening time for 100 studies between GPT-3.5 Turbo and GPT-4 Turbo. The difference in processing time was 0.69 (95% CI 0.53-0.86) min. An unpaired, 2-tailed <italic>t</italic> test was used for analysis. CQ: clinical question. 
*Statistically significant at <italic>P</italic>&#x003C;.001.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e64682_fig02.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Our analysis showed that GPT-4 Turbo had similar sensitivity to but higher specificity than GPT-3.5 Turbo, with minimal impact on screening speed. The high specificity of GPT-4 Turbo is crucial for reducing workloads in subsequent review phases by minimizing the inclusion of irrelevant studies. Although GPT-3.5 Turbo demonstrated shorter screening times, its lower specificity may increase review times. Given the trade-off relationship between sensitivity and specificity, LLM users should choose the optimal model according to their situations.</p><p>Our findings emphasize the impact of LLMs&#x2019; development on their performance for citation screening and the need to reinforce a model&#x2019;s suitability for accurate and reliable citation screening [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Although LLMs are promising tools for title and abstract screening in systematic reviews [<xref ref-type="bibr" rid="ref4">4</xref>], caution is warranted until further investigations validate their reliability in real-world applications.</p><p>This study has several limitations. First, the focus on sepsis limits the generalizability of the findings. Further validation with diverse datasets in other medical domains would enhance the robustness of our conclusions. Second, the post hoc nature of this study may have introduced selection bias. Third, evaluation metrics depend on the reference standard. Fourth, this study did not investigate other LLMs or prompts created via prompt engineering, which could have improved performance. Fifth, the results were based on the LLMs available at the time of analysis. 
Future investigations should use OpenAI o1 or newer models.</p><p>In conclusion, GPT-4 Turbo demonstrated higher specificity than and similar sensitivity to GPT-3.5 Turbo, making GPT-4 Turbo more suitable for systematic reviews, despite having slightly longer processing times.</p></sec></body><back><ack><p>We would like to thank all contributors to the Japanese Society of Intensive Care Medicine and the Japanese Association of Emergency Medicine. YO thanks the Japan Society for the Promotion of Science Overseas Research Fellowships. YO received a research grant from the ZOLL Foundation and overseas scholarships from the FUKUDA Foundation for Medical Technology and International Medical Research Foundation. YO was supported by the KPFA (Khoo Postdoctoral Fellowship Award) fellowship (Duke-NUS-KPFA/2024/0073). These organizations had no role in conducting this research. The authors received no specific funding for this work. We thank Honyaku Center Inc for English language editing.</p></ack><fn-group><fn fn-type="con"><p>TO, YO, and TN contributed to the study concept and design, statistical analysis and interpretation of data, drafting of the manuscript, and critical revision of the manuscript for important intellectual content. TO performed the computation to extract the necessary data. 
All the authors have read and approved the final version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CQ</term><def><p>clinical question</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">STARD</term><def><p>Standards for Reporting of Diagnostic Accuracy</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nayfeh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tetzlaff</surname><given-names>J</given-names> </name><name name-style="western"><surname>O&#x2019;Blenis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Murad</surname><given-names>MH</given-names> </name></person-group><article-title>Error rates of human reviewers during abstract screening in systematic reviews</article-title><source>PLoS One</source><year>2020</year><month>01</month><day>14</day><volume>15</volume><issue>1</issue><fpage>e0227742</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0227742</pub-id><pub-id pub-id-type="medline">31935267</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Hearn</surname><given-names>K</given-names> </name><name name-style="western"><surname>MacDonald</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tsampalieros</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluating the relationship between citation set size, team size and screening methods used in 
systematic reviews: a cross-sectional study</article-title><source>BMC Med Res Methodol</source><year>2021</year><month>07</month><day>8</day><volume>21</volume><issue>1</issue><fpage>142</fpage><pub-id pub-id-type="doi">10.1186/s12874-021-01335-5</pub-id><pub-id pub-id-type="medline">34238247</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Potential roles of large language models in the production of systematic reviews and meta-analyses</article-title><source>J Med Internet Res</source><year>2024</year><month>06</month><day>25</day><volume>26</volume><fpage>e56780</fpage><pub-id pub-id-type="doi">10.2196/56780</pub-id><pub-id pub-id-type="medline">38819655</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>VT</given-names> 
</name><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yaacoub</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Sensitivity and specificity of using GPT-3.5 Turbo models for title and abstract screening in systematic reviews and meta-analyses</article-title><source>Ann Intern Med</source><year>2024</year><month>06</month><volume>177</volume><issue>6</issue><fpage>791</fpage><lpage>799</lpage><pub-id pub-id-type="doi">10.7326/M23-3389</pub-id><pub-id pub-id-type="medline">38768452</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oami</surname><given-names>T</given-names> </name><name name-style="western"><surname>Okada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakada</surname><given-names>TA</given-names> </name></person-group><article-title>Performance of a large language model in screening citations</article-title><source>JAMA Netw Open</source><year>2024</year><month>07</month><day>1</day><volume>7</volume><issue>7</issue><fpage>e2420496</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.20496</pub-id><pub-id pub-id-type="medline">38976267</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dennst&#x00E4;dt</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zink</surname><given-names>J</given-names> </name><name name-style="western"><surname>Putora</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Hastings</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cihoric</surname><given-names>N</given-names> </name></person-group><article-title>Title and abstract screening 
for literature reviews using large language models: an exploratory study in the biomedical domain</article-title><source>Syst Rev</source><year>2024</year><month>06</month><day>15</day><volume>13</volume><issue>1</issue><fpage>158</fpage><pub-id pub-id-type="doi">10.1186/s13643-024-02575-4</pub-id><pub-id pub-id-type="medline">38879534</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>X</given-names> </name></person-group><article-title>Evaluating the effectiveness of large language models in abstract screening: a comparative analysis</article-title><source>Syst Rev</source><year>2024</year><month>08</month><day>21</day><volume>13</volume><issue>1</issue><fpage>219</fpage><pub-id pub-id-type="doi">10.1186/s13643-024-02609-x</pub-id><pub-id pub-id-type="medline">39169386</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matsui</surname><given-names>K</given-names> </name><name name-style="western"><surname>Utsumi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Aoki</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Maruki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Takeshima</surname><given-names>M</given-names> </name><name name-style="western"><surname>Takaesu</surname><given-names>Y</given-names> </name></person-group><article-title>Human-comparable sensitivity of large language models in identifying eligible studies through title and abstract screening: 3-layer strategy using GPT-3.5 and GPT-4 for systematic 
reviews</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>16</day><volume>26</volume><fpage>e52758</fpage><pub-id pub-id-type="doi">10.2196/52758</pub-id><pub-id pub-id-type="medline">39151163</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Egi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ogura</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yatabe</surname><given-names>T</given-names> </name><etal/></person-group><article-title>The Japanese Clinical Practice Guidelines for Management of Sepsis and Septic Shock 2020 (J-SSCG 2020)</article-title><source>J Intensive Care</source><year>2021</year><month>08</month><day>25</day><volume>9</volume><issue>1</issue><fpage>53</fpage><pub-id pub-id-type="doi">10.1186/s40560-021-00555-7</pub-id><pub-id pub-id-type="medline">34433491</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary content regarding the clinical questions, the conventional citation screening, the command prompt used, the automated implementation of the citation screening process, and further data on the comparisons conducted.</p><media xlink:href="medinform_v13i1e64682_app1.docx" xlink:title="DOCX File, 110 KB"/></supplementary-material></app-group></back></article>