<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e64143</article-id><article-id pub-id-type="doi">10.2196/64143</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>Practical Aspects of Using Large Language Models to Screen Abstracts for Cardiovascular Drug Development: Cross-Sectional Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ronquillo</surname><given-names>Jay G</given-names></name><degrees>MPH, MMSc, MEng, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ye</surname><given-names>Jamie</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gorman</surname><given-names>Donal</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lemeshow</surname><given-names>Adina 
R</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Watt</surname><given-names>Stephen J</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Worldwide Medical and Safety, Pfizer Research and Development, Pfizer Inc</institution>, <addr-line>New York</addr-line><addr-line>NY</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>Pfizer Research and Development UK Ltd</institution>, <addr-line>Cambridge</addr-line>, <country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bilgin</surname><given-names>Emre</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zaghir</surname><given-names>Jamil</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jay G Ronquillo, MPH, MMSc, MEng, MD, Worldwide Medical and Safety, Pfizer Research and Development, Pfizer Inc, 66 Hudson Blvd, New York, NY, 10001, United States, 1 212-733-2323; <email>jeremiahjose.ronquillo@pfizer.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>30</day><month>9</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e64143</elocation-id><history><date date-type="received"><day>09</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>29</day><month>08</month><year>2024</year></date><date date-type="accepted"><day>01</day><month>09</month><year>2024</year></date></history><copyright-statement>&#x00A9; Jay G Ronquillo, Jamie Ye, Donal Gorman, Adina R Lemeshow, Stephen J Watt. 
Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 30.9.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e64143"/><abstract><p>Cardiovascular drug development requires synthesizing relevant literature about indications, mechanisms, biomarkers, and outcomes. 
This short study investigates the performance, cost, and prompt engineering trade-offs of 3 large language models accelerating the literature screening process for cardiovascular drug development applications.</p></abstract><kwd-group><kwd>biomedical informatics</kwd><kwd>drug development</kwd><kwd>cardiology</kwd><kwd>cardio</kwd><kwd>LLM</kwd><kwd>biomedical</kwd><kwd>drug</kwd><kwd>cross-sectional study</kwd><kwd>biomarker</kwd><kwd>cardiovascular</kwd><kwd>screening optimization</kwd><kwd>GPT</kwd><kwd>large language model</kwd><kwd>AI</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Cardiovascular drug development requires synthesizing information about indications, mechanisms, biomarkers, and outcomes [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Large language models (LLMs) leveraging billions of data points could accelerate fundamental, resource-intensive aspects of this process, like screening published literature [<xref ref-type="bibr" rid="ref3">3</xref>]. However, this depends on the design, development, and implementation of LLM instructions (prompt engineering) that work effectively within the context of cardiology [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. 
To our knowledge, this is one of the first studies investigating LLMs to accelerate the literature screening process for cardiovascular drug development applications [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>Leveraging prior work, a PubMed query using both available Medical Subject Headings (MeSH) and the title and abstract keyword search of MeSH Entry Terms identified observational studies of heart failure that (1) were published from 2013 to 2023, (2) contained at least one relevant biomarker (brain natriuretic peptide, N-terminal pro&#x2013;atrial natriuretic peptide, N-terminal pro&#x2013;brain natriuretic peptide, and peak oxygen consumption), and (3) measured long-term outcomes (hospitalization and mortality) [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Abstracts were extracted through the PubMed application programming interface (API), and LLM instructions (prompts) were created to assess different screening optimization strategies (<xref ref-type="fig" rid="figure1">Figure 1</xref>) across LLMs (GPT-3.5 Turbo [OpenAI], GPT-4 [OpenAI], and Claude 2 [Anthropic PBC]) [<xref ref-type="bibr" rid="ref5">5</xref>]. The &#x201C;base&#x201D; LLM prompt (1) presented abstract text, (2) listed two eligibility screening criteria (ie, values found for at least one biomarker and outcome), and (3) instructed LLMs to determine if abstracts met eligibility criteria and return results in a standardized format. 
&#x201C;Technical&#x201D; optimization was defined as adding delimiters to the base prompt delineating key sections (abstract and criteria), while &#x201C;content&#x201D; optimization further instructed LLMs to (1) assume a scientific role and (2) address a cardiology drug development target audience [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. The different prompts used in this study are described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Total units of text processed (&#x201C;tokens&#x201D;) were estimated using spaCy, and LLM abstract screening costs were estimated using current API prices per million input and output tokens, respectively, for GPT-3.5 (US $0.50 and US $1.50), GPT-4 (US $30 and US $60), and Claude 2 (US $8 and US $24).</p><p>A Python script performed data processing and analysis. Accuracy was assessed by comparing LLM outputs against manual epidemiologist review of study suitability for inclusion, with descriptive statistics calculated for each LLM and prompt type. Performance differences between fully optimized prompts (GPT-3.5 vs GPT-4, GPT-3.5 vs Claude 2, and GPT-4 vs Claude 2) were evaluated using the chi-square test. A <italic>P</italic> value of &#x003C;.05 was considered statistically significant.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Biomedical informatics pipeline for comparing different LLM and prompt optimization approaches to abstract screening for cardiovascular drug development. 
LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e64143_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study did not meet the definition of human participants research and thus did not require institutional review board approval.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Of 69 articles found in PubMed, 32 (46%) met eligibility criteria after manual review; corresponding LLM screening accuracies are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. By LLM, the best performances came from the base prompt (GPT-3.5), technical and combined prompts (GPT-4), and technical prompts (Claude 2). Overall, combined prompts for GPT-3.5 and GPT-4 performed similarly against each other (<italic>P</italic>&#x003E;.99) and against Claude 2 (<italic>P</italic>=.61 against both).</p><p>GPT-3.5 processed a total of 124,826 tokens, while GPT-4 and Claude 2 processed 14.4% (N=142,750) and 15.9% (N=144,703) more tokens, respectively. 
Total costs for GPT-4 (US $4.89) and Claude 2 (US $1.52) were 75.4 and 23.4 times higher, respectively, than total costs for GPT-3.5 (US $0.06).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Abstract screening accuracies reflecting total abstracts correctly identified by large language models (LLMs) for inclusion and exclusion based on manual review of study suitability, by LLM and prompt optimization type (abstracts: N=69).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Prompt optimization type</td><td align="left" valign="bottom" colspan="3">Accuracy, n (%)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude 2</td></tr></thead><tbody><tr><td align="left" valign="top">Base (none)</td><td align="left" valign="top">43 (62)</td><td align="left" valign="top">40 (58)</td><td align="left" valign="top">35 (51)</td></tr><tr><td align="left" valign="top">Technical</td><td align="left" valign="top">34 (49)</td><td align="left" valign="top">41 (59)</td><td align="left" valign="top">43 (62)</td></tr><tr><td align="left" valign="top">Content</td><td align="left" valign="top">42 (61)</td><td align="left" valign="top">38 (55)</td><td align="left" valign="top">38 (55)</td></tr><tr><td align="left" valign="top">Technical and content</td><td align="left" valign="top">41 (59)</td><td align="left" valign="top">41 (59)</td><td align="left" valign="top">37 (54)</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Despite the complex and limited public cardiology data integrated into LLMs, our findings were consistent with similar studies for oncology and current LLM abilities to pass medical licensing exams [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
Performance could be further improved by adding specific examples to the prompt (few-shot prompting) or to the LLM training data (fine-tuning) [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Technical optimizations showed modest performance improvements across some LLMs, indicating one practical way to improve accuracy and prompt readability without significantly expanding the size of input prompts. Standardizing outputs helped generate valid responses, although GPT-4 and Claude 2 still had higher costs as a result of more verbose output. Enterprise LLM&#x2013;based abstract screening will require balancing prompt performance, cost, and complexity with cardiology subject matter expert capabilities and workflows.</p><p>Limitations include a small cardiovascular dataset leveraging proprietary LLMs and only a subset of available optimization techniques. Future efforts must engage diverse scientific communities; develop guardrails to ensure safe and responsible LLM use; and apply data-driven best practices that generalize, optimize, and validate LLM applications and their impact on patients with cardiovascular disease.</p></sec></body><back><fn-group><fn fn-type="conflict"><p>All authors are employees of Pfizer. 
The funding sources had no role in the design and conduct of this study; collection, management, analysis, and interpretation of the data; preparation, review, or approval of the manuscript; and decision to submit the manuscript for publication.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MeSH</term><def><p>Medical Subject Headings</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boonstra</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Weissenbacher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Gonzalez-Hernandez</surname><given-names>G</given-names> </name><name name-style="western"><surname>Asselbergs</surname><given-names>FW</given-names> </name></person-group><article-title>Artificial intelligence: revolutionizing cardiology with large language models</article-title><source>Eur Heart J</source><year>2024</year><month>02</month><day>1</day><volume>45</volume><issue>5</issue><fpage>332</fpage><lpage>345</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehad838</pub-id><pub-id pub-id-type="medline">38170821</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wessler</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Kramer</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>JL</given-names> 
</name><etal/></person-group><article-title>Drug and device effects on peak oxygen consumption, 6-minute walk distance, and natriuretic peptides as predictors of therapeutic effects on mortality in patients with heart failure and reduced ejection fraction</article-title><source>Circ Heart Fail</source><year>2011</year><month>09</month><volume>4</volume><issue>5</issue><fpage>578</fpage><lpage>588</lpage><pub-id pub-id-type="doi">10.1161/CIRCHEARTFAILURE.111.961573</pub-id><pub-id pub-id-type="medline">21705485</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reason</surname><given-names>T</given-names> </name><name name-style="western"><surname>Benbow</surname><given-names>E</given-names> </name><name name-style="western"><surname>Langham</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gimblett</surname><given-names>A</given-names> </name><name name-style="western"><surname>Klijn</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Malcolm</surname><given-names>B</given-names> </name></person-group><article-title>Artificial intelligence to automate network meta-analyses: four case studies to evaluate the potential application of large language models</article-title><source>Pharmacoecon Open</source><year>2024</year><month>03</month><volume>8</volume><issue>2</issue><fpage>205</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1007/s41669-024-00476-9</pub-id><pub-id pub-id-type="medline">38340277</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ferber</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><name 
name-style="western"><surname>W&#x00F6;lflein</surname><given-names>G</given-names> </name><etal/></person-group><article-title>GPT-4 for information retrieval and comparison of medical oncology guidelines</article-title><source>NEJM AI</source><year>2024</year><month>05</month><day>17</day><volume>1</volume><issue>6</issue><pub-id pub-id-type="doi">10.1056/AIcs2300235</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kelley</surname><given-names>M</given-names> </name><name name-style="western"><surname>Samolyk-Mazzanti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Visweswaran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>An empirical evaluation of prompting strategies for large language models in zero-shot clinical natural language processing: algorithm development and validation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>8</day><volume>12</volume><fpage>e55318</fpage><pub-id pub-id-type="doi">10.2196/55318</pub-id><pub-id pub-id-type="medline">38587879</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Medapalli</surname><given-names>T</given-names> </name><name name-style="western"><surname>Alexandrou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brilakis</surname><given-names>E</given-names> </name><name name-style="western"><surname>Prasad</surname><given-names>A</given-names> 
</name></person-group><article-title>Exploring the role of ChatGPT in cardiology: a systematic review of the current literature</article-title><source>Cureus</source><year>2024</year><month>04</month><day>24</day><volume>16</volume><issue>4</issue><fpage>e58936</fpage><pub-id pub-id-type="doi">10.7759/cureus.58936</pub-id><pub-id pub-id-type="medline">38800264</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zaghir</surname><given-names>J</given-names> </name><name name-style="western"><surname>Naguib</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bjelogrlic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Neveol</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tannier</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lovis</surname><given-names>C</given-names> </name></person-group><article-title>Prompt engineering paradigms for medical applications: scoping review and recommendations for better practices</article-title><source>arXiv</source><comment>Preprint posted online on  May 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.01249</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Plasek</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Large language models for biomedicine: foundations, opportunities, challenges, and best practices</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2114</fpage><lpage>2124</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae074</pub-id><pub-id pub-id-type="medline">38657567</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><access-date>2024-09-18</access-date><conf-name>34th Conference on Neural Information Processing Systems (NeurIPS 2020)</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, Canada</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf">https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Approach for creating prompts focused on abstract screening for cardiovascular drug development, starting with the base prompt (black) and including content optimization (A) and technical optimization (B-E).</p><media xlink:href="medinform_v12i1e64143_app1.png" xlink:title="PNG File, 404 KB"/></supplementary-material></app-group></back></article>