<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e76128</article-id><article-id pub-id-type="doi">10.2196/76128</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparative Analysis of Generative Artificial Intelligence Systems in Solving Clinical Pharmacy Problems: Mixed Methods Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Lulu</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Du</surname><given-names>Pengqiang</given-names></name><degrees>MMSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>Xiaojing</given-names></name><degrees>MMSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Hongwei</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ni</surname><given-names>Ming</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yan</surname><given-names>Meng</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Aifeng</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Pharmacy, Fuwai Central China Cardiovascular Hospital</institution><addr-line>1 Fuwai Road, Zhengdong New District</addr-line><addr-line>Zhengzhou</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Marcaccini</surname><given-names>Gianluca</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Jinghong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Alshiekh</surname><given-names>Mona</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Aifeng Wang, MSc, Department of Pharmacy, Fuwai Central China Cardiovascular Hospital, 1 Fuwai Road, Zhengdong New District, Zhengzhou, China, 86 18538298379; <email>waf112128@163.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>24</day><month>7</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e76128</elocation-id><history><date date-type="received"><day>16</day><month>04</month><year>2025</year></date><date 
date-type="rev-recd"><day>17</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>17</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9;Lulu Li, Pengqiang Du, Xiaojing Huang, Hongwei Zhao, Ming Ni, Meng Yan, Aifeng Wang. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 24.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e76128"/><abstract><sec><title>Background</title><p>Generative artificial intelligence (AI) systems are increasingly deployed in clinical pharmacy; yet, systematic evaluation of their efficacy, limitations, and risks across diverse practice scenarios remains limited.</p></sec><sec><title>Objective</title><p>This study aims to quantitatively evaluate and compare the performance of 8 mainstream generative AI systems across 4 core clinical pharmacy scenarios&#x2014;medication consultation, medication education, prescription review, and case analysis with pharmaceutical care&#x2014;using a multidimensional framework.</p></sec><sec sec-type="methods"><title>Methods</title><p>Forty-eight clinically validated questions were selected via stratified sampling from real-world sources (eg, hospital consultations, clinical case banks, and national pharmacist training databases). Three researchers simultaneously tested 8 different generative AI systems (ERNIE Bot, Doubao, Kimi, Qwen, GPT-4o, Gemini-1.5-Pro, Claude-3.5-Sonnet, and DeepSeek-R1) using standardized prompts within a single day (February 20, 2025). A double-blind scoring design was used, with 6 experienced clinical pharmacists (&#x2265;5 years experience) evaluating the AI responses across 6 dimensions: accuracy, rigor, applicability, logical coherence, conciseness, and universality, scored 0&#x2010;10 per predefined criteria (eg, &#x2212;3 for inaccuracy and &#x2212;2 for incomplete rigor). 
Statistical analysis used one-way ANOVA with Tukey Honestly Significant Difference (HSD) post hoc testing and intraclass correlation coefficients (ICC) for interrater reliability (2-way random model). Qualitative thematic analysis identified recurrent errors and limitations.</p></sec><sec sec-type="results"><title>Results</title><p>DeepSeek-R1 (DeepSeek) achieved the highest overall performance (mean composite score: medication consultation 9.4, SD 1.0; case analysis 9.3, SD 1.0), significantly outperforming others in complex tasks (<italic>P</italic>&#x003C;.05). Critical limitations were observed across models, including high-risk decision errors&#x2014;75% omitted critical contraindications (eg, ethambutol in optic neuritis) and a lack of localization&#x2014;90% erroneously recommended macrolides for drug-resistant <italic>Mycoplasma pneumoniae</italic> (China&#x2019;s high-resistance setting), while only DeepSeek-R1 aligned with updated American Academy of Pediatrics (AAP) guidelines for pediatric doxycycline. Complex reasoning deficits: only Claude-3.5-Sonnet detected a gender-diagnosis contradiction (prostatic hyperplasia in female); no model identified diazepam&#x2019;s 7-day prescription limit. Interrater consistency was lowest for conciseness in case analysis (ICC=0.70), reflecting evaluator disagreement on complex outputs. ERNIE Bot (Baidu) consistently underperformed (case analysis: 6.8, SD 1.5; <italic>P</italic>&#x003C;.001 vs DeepSeek-R1).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>While generative AI shows promise as a pharmacist assistance tool, significant limitations&#x2014;including high-risk errors (eg, contraindication omissions), inadequate localization, and complex reasoning gaps&#x2014;preclude autonomous clinical decision-making. 
Performance stratification highlights DeepSeek-R1&#x2019;s current advantage, but all systems require optimization in dynamic knowledge updating, complex scenario reasoning, and output interpretability. Future deployment must prioritize human oversight (human-AI co-review), ethical safeguards, and continuous evaluation frameworks.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>DeepSeek-R1</kwd><kwd>clinical pharmacy</kwd><kwd>comparative analysis</kwd><kwd>generative AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>With the breakthrough development of generative artificial intelligence (AI) technology, the health care field is experiencing a significant transformation, with AI-driven pharmaceutical practice at the forefront of this evolution [<xref ref-type="bibr" rid="ref1">1</xref>]. Pharmaceutical intelligence has the potential to transform pharmaceutical practice by addressing the complexity of drug data, evolving health care needs, and technological advancements. Global research indicates that AI demonstrates transregional universality in enhancing the efficiency of drug information retrieval [<xref ref-type="bibr" rid="ref2">2</xref>]. Clinical studies indicate that such systems demonstrate significant advantages in efficient drug information retrieval and exhibit a certain degree of accuracy and specificity in predicting drug interactions [<xref ref-type="bibr" rid="ref3">3</xref>]. However, recent systematic reviews point out that current AI chatbots primarily face challenges of &#x201C;generating inaccurate or fabricated content&#x201D; and &#x201C;lower accuracy in answering questions&#x201D; [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
This gap between technological potential and practical application [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>] highlights the urgency of establishing scientific evaluation systems to identify high-quality generative AI systems.</p><p>Despite some existing research beginning to conduct clinical application assessments of generative AI dialogue systems, these efforts are largely limited to testing individual models on single tasks [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], lacking horizontal comparative analysis across multiple dialogue models and validation of continuous decision chains in real clinical scenarios.</p><p>This research innovatively constructs a 6-dimensional evaluation system, conducting systematic assessment and comparative analysis of 4 types of clinical pharmacy practice scenarios: medication consultation, medication education, prescription review, and case analysis with pharmaceutical care. The study sample encompasses 8 representative mainstream dialogue-based AI platforms from both domestic and international origins: ERNIE Bot (version: 4.0; Baidu; Release Date: October 17, 2023), Doubao (version: Pro; ByteDance; Release Date: May 15, 2024), Kimi (version: V1.1; Beijing Moonshot Technology Co., Ltd.; Release Date: November 16, 2023), Qwen (version: long; Alibaba Cloud; Release Date: May 21, 2024), GPT (version: 4o; OpenAI; Release Date: May 14, 2024), Gemini (version: 1.5-Pro; Google DeepMind; Release Date: February 14, 2024), Claude (version: 3.5-Sonnet; Anthropic; Release Date: June 21, 2024), and DeepSeek (version: R1; Hangzhou DeepSeek Artificial Intelligence Basic Technology Research Co, Ltd; Release Date: January 20, 2025). 
Through designing parallel tests in realistic clinical settings and using a modified Delphi method for double-blind evaluation, we quantitatively analyzed and descriptively evaluated the capabilities of these 8 generative AI dialogue systems in addressing clinical pharmacy problems across 6 dimensions: accuracy, rigor, applicability, logical coherence, conciseness, and universality. The research findings will provide empirical evidence for optimizing the application of generative AI systems in clinical pharmacy and offer valuable reference for constructing AI-assisted decision-making systems that conform to medical ethics.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Research Design</title><p>We collected 48 common questions from 4 categories of clinical pharmacy work scenarios:</p><list list-type="order"><list-item><p>Medication consultation questions (n=20) content covered 10 aspects, with 2 questions per aspect: drug indications (efficacy), administration methods, dosage, medication precautions, drug interactions, storage methods, identification and management of adverse drug reactions, special dosage form usage guidance, medication use in special populations, and disease prevention.</p></list-item><list-item><p>Medication education questions (n=10) content primarily covered medication use for patients with chronic diseases and special populations.</p></list-item><list-item><p>Prescription audit questions (n=10) content encompassed inappropriate treatment regimens, inappropriate usage and dosage, inappropriate combination therapy, inappropriate drug selection, inappropriate administration routes, contraindications, inappropriate treatment duration, and inappropriate clinical diagnosis. 
The task required assuming the role of a pharmacist to identify prescription errors using relevant pharmacotherapeutic knowledge and the latest clinical guidelines.</p></list-item><list-item><p>Case analysis and pharmaceutical care questions (n=8) content included common chronic disease cases such as coronary heart disease, hypertension, type 2 diabetes, asthma, chronic obstructive pulmonary disease, gout, lung cancer, and so on. The task required assuming the role of a pharmacist to analyze pharmacotherapy plans based on patient information (basic information, reason for visit, present illness history, past medical history, medication history, family history, allergy history, adverse reaction history, unhealthy habits, diagnosis, current medication records, and auxiliary examination results), and to develop a pharmaceutical care plan addressing 4 aspects: indications, effectiveness, safety, and adherence.</p></list-item></list><p>The study used a standardized experimental design, with 3 researchers using identical &#x201C;inquiry prompts&#x201D; to question 8 generative AI dialogue systems during the same time period. All models were tested using their publicly available versions as of February 20, 2025; the results reflect a performance snapshot restricted to this timepoint. Each chatbot received 48 inquiry prompts, generating a total of 384 independent response samples. The evaluation was conducted by 6 clinical pharmacists who had successfully obtained clinical pharmacist training certificates after standardized training and had more than 5 years of clinical pharmacy work experience. The evaluation encompassed six dimensions: (1) accuracy, (2) rigor, (3) applicability, (4) logical coherence, (5) conciseness, and (6) universality. These clinical pharmacists&#x2019; professional domains covered all disease types relevant to the questions. 
A double-blind scoring mechanism was implemented for independent evaluation to avoid subjective bias and ensure objectivity and fairness in the assessment process.</p><p>Standardized prompting instructions: all questions were input to the model using a standardized format. The core instruction template was to act in the role of a clinical pharmacist. Based on the latest clinical guidelines and evidence-based principles, answer the following question (Specific question description). For prescription review tasks, the following emphasis was added: determine whether this prescription contains errors and provide your rationale. For case analysis and pharmaceutical care tasks, the following emphasis was added: analyze the pharmacotherapy plan for this case and develop a pharmaceutical care plan addressing the following 4 aspects &#x201C;Indication, Efficacy, Safety, and Adherence.&#x201D; The complete list of all 48 standardized prompting instructions used in this study is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Data Sources</title><p>Questions were collected using stratified sampling, drawing from the following sources: common medication inquiries at a medication consultation clinic in a large Grade A Class 3 hospital in China, real clinical cases, the theoretical assessment question banks for standardized clinical pharmacist training programs of the Chinese Medical Association (CMA) and the Chinese Hospital Association (CHA), the China Medication Therapy Management (MTM) pharmacist training program, and the China Pharmacist Skills Competition. 
Specific steps are as follows:</p><p>Question bank construction: questions from the 5 sources above were categorized into 4 scenarios: medication consultation, medication education, prescription review, and case analysis and pharmaceutical care.</p><p>Stratified sampling:</p><list list-type="bullet"><list-item><p>Medication consultation (20 questions): ensured coverage of the 10 aspects listed in the Research Design section (medication indications, administration methods, dosage, precautions, drug-drug interactions, storage methods, identification and management of adverse drug reactions, guidance on special dosage forms, medication use in special populations, and disease prevention). Two questions were randomly selected from each aspect.</p></list-item><list-item><p>Medication education (10 questions): randomly selected from 2 subcategories: medication use for patients with chronic disease (6 questions) and medication use for special populations (4 questions).</p></list-item><list-item><p>Prescription review (10 questions): ensured coverage of the 8 types of inappropriate prescriptions listed in the Research Design section (inappropriate therapeutic regimen, inappropriate dosage and administration, inappropriate combination therapy, inappropriate drug selection, inappropriate route of administration, presence of contraindications, inappropriate duration of therapy, and inconsistency with clinical diagnosis). Prescription cases containing typical or high-risk errors were prioritized.</p></list-item><list-item><p>Case analysis and pharmaceutical care (8 questions): 8 representative cases were randomly selected from the question bank covering common chronic diseases (eg, coronary heart disease, hypertension, type 2 diabetes, asthma, chronic obstructive pulmonary disease, gout, and lung cancer).</p></list-item></list><p>Question screening: the initially screened questions were independently reviewed by 2 senior clinical pharmacists. 
This process ensured the questions aligned with the assessment objectives, possessed clinical relevance, were clear and unambiguous, and excluded duplicate or outdated items. Ultimately, 48 questions were finalized for inclusion in the study.</p></sec><sec id="s2-3"><title>Evaluation Methods</title><p>Scoring standards were constructed based on evidence-based medical references, including drug package inserts, the latest clinical guidelines, and the Micromedex database. Clinical pharmacists conducted quantitative assessments of response contents according to standard answers and scoring criteria, with accuracy levels represented on a 0&#x2010; to 10-point scale. Unified scoring rules were established, with each question having a maximum score of 10 points, with deductions as follows:</p><list list-type="order"><list-item><p>Accuracy: 3 points deducted for not directly addressing the question or not providing an accurate answer;</p></list-item><list-item><p>Rigor: 2 points deducted for incomplete answers;</p></list-item><list-item><p>Applicability: 2 points deducted for failing to provide individualized recommendations based on patient-specific conditions;</p></list-item><list-item><p>Logical coherence: 1 point deducted for unclear reasoning or disorganized logic;</p></list-item><list-item><p>Conciseness: 1 point deducted for verbose language;</p></list-item><list-item><p>Universality: 1 point deducted for overly technical terminology lacking general applicability;</p></list-item><list-item><p>Comprehensive answers without deduction criteria: 0 points deducted.</p></list-item></list><p>Evaluators were required to select at least one scoring option during the assessment process.</p></sec><sec id="s2-4"><title>Statistical Analysis</title><p>One-way ANOVA was used to compare score differences among the 8 generative AI dialogue systems across 4 task categories (medication consultation, medication education, prescription review, and case analysis with pharmaceutical 
care). The data satisfied the assumptions of normality (Shapiro&#x2013;Wilk tests; <italic>P</italic>&#x003E;.05) and homogeneity of variance (Levene tests; <italic>P</italic>&#x003E;.05); ANOVA was therefore followed by Tukey Honestly Significant Difference (HSD) multiple comparison analysis to identify significant differences. Interrater consistency was calculated using the intraclass correlation coefficient (ICC) through a 2-way random effects model, with ICC&#x003E;0.75 indicating good consistency.</p></sec><sec id="s2-5"><title>Qualitative Analysis Process</title><p>Following quantitative scoring, the research team (3 investigators) conducted a systematic content review of all 384 AI-generated responses to identify recurrent strengths, typical error patterns, critical limitations, and notable variations across models. The review process followed the steps mentioned below.</p><p>Initial screening: investigators independently reviewed all responses, documenting exemplar responses that demonstrated exceptional performance or significant deficiencies in accuracy, rigor, applicability, logical coherence, conciseness, or universality.</p><p>Theme identification: investigators consolidated preliminary observations and identified recurring themes through discussion (eg, &#x201C;Errors in Special Device Instructions,&#x201D; &#x201C;Lack of Localized Medication Recommendations,&#x201D; &#x201C;Failure to Identify Critical Contraindications,&#x201D; &#x201C;Overly Technical or Jargon-Rich or Verbose Language,&#x201D; and &#x201C;Ignoring Conflicting Gender-Specific Diagnostic Criteria&#x201D;).</p><p>Case selection: for each identified theme, representative response examples demonstrating the issue or strength were selected across different models. 
Priority was given to cases where quantitative assessment revealed performance variations and where the response content clearly illustrated the qualitative concern.</p><p>Content extraction and verification: key content was abstracted from selected cases to ensure descriptions accurately reflected the original response meaning (with critical verbatim excerpts cited where necessary). Final qualitative cases and their analyses were confirmed by consensus within the research team.</p><p>These qualitative findings supplement the quantitative results, providing deeper insight into model performance variations and potential risks across different clinical scenarios. They are reported and analyzed in detail in the Discussion section. The methodology flowchart is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Research methodology flow diagram. AI: artificial intelligence; ICC: intraclass correlation coefficient.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76128_fig01.png"/></fig></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study constitutes a noninterventional evaluation of outputs from publicly accessible generative AI systems. It does not involve direct participation, intervention, or interaction with human participants (patients or healthy volunteers).</p><p>According to the policy of the Ethics Committee of Fuwai Central China Cardiovascular Hospital, this type of research (performance evaluation of publicly available AI systems using deidentified question banks and simulated scenarios) is qualified for exemption from formal ethics review application. The study design strictly followed the principles of the Declaration of Helsinki concerning nonbiomedical research.</p><p>This study exclusively used rigorously deidentified question texts as input stimuli for the AI systems. 
It did not involve the use, storage, or analysis of any raw patient data containing personally identifiable information. Therefore, additional informed consent for this secondary analysis was not required.</p><p>All question texts input into the AI systems were based on simulated scenarios or deidentified, generic inquiries, containing no real, personally identifiable patient information. The AI-generated output texts produced during the study contained only clinical pharmacy knowledge-related responses and similarly did not involve any private personal data. All research data (question bank, AI responses, and scoring sheets) were encrypted during storage and transmission and were accessible only to authorized researchers.</p><p>The 6 clinical pharmacists participating in the assessment received market-standard honoraria commensurate with their professional contribution.</p><p>This study did not use any images containing personally identifiable information (eg, faces, unique physical characteristics, and personal details). All results presentation figures (eg, <xref ref-type="fig" rid="figure1">Figures 1</xref><xref ref-type="fig" rid="figure2"/>-<xref ref-type="fig" rid="figure3">3</xref>) were generated based on aggregated statistical data and anonymized ICC values, posing no risk of individual identity disclosure.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Score distribution and differential analysis of artificial intelligence platforms across 4 pharmaceutical tasks (box plot). 
(Note: *<italic>P</italic>&#x003C;.05: significant difference, **<italic>P</italic>&#x003C;.01: highly significant difference, ***<italic>P</italic>&#x003C;.001: extremely significant difference, ****<italic>P</italic>&#x003C;&#x003C;.001: most significant difference).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76128_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Rater consistency heat map (heat map of intraclass correlation coefficients).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76128_fig03.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Quantitative Evaluation of 8 Generative AI Dialogue Systems in Clinical Pharmacy Applications</title><sec id="s3-1-1"><title>Descriptive Statistics</title><p>DeepSeek-R1 demonstrated the strongest comprehensive capabilities, with particularly significant advantages in complex tasks (such as case analysis and pharmaceutical care). Qwen, GPT-4o, Claude-3.5-Sonnet, and Gemini-1.5-Pro performed exceptionally in certain tasks but were overall inferior to DeepSeek-R1. Doubao and Kimi showed inconsistent performance, while ERNIE Bot performed the poorest, indicating a need for targeted optimization. 
The SDs for scores in case analysis and pharmaceutical care tasks were relatively large, suggesting poorer consistency among clinical pharmacists when evaluating generative AI dialogue systems&#x2019; performance in handling these complex issues (see <xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Scores of 8 generative artificial intelligence (AI) dialogue systems across 4 problem types.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">AI platforms</td><td align="left" valign="bottom">Medication consultation (n=20), mean (SD)</td><td align="left" valign="bottom">Medication education (n=10), mean (SD)</td><td align="left" valign="bottom">Prescription audit (n=10), mean (SD)</td><td align="left" valign="bottom">Case analysis and pharmaceutical care (n=8), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">ERNIE Bot</td><td align="left" valign="top">7.3 (1.2)</td><td align="left" valign="top">8.4 (0.9)</td><td align="left" valign="top">7.3 (1.0)</td><td align="left" valign="top">6.8 (1.5)</td></tr><tr><td align="left" valign="top">Doubao</td><td align="left" valign="top">8.1 (1.1)</td><td align="left" valign="top">8.7 (1.3)</td><td align="left" valign="top">7.6 (1.2)</td><td align="left" valign="top">7.7 (1.7)</td></tr><tr><td align="left" valign="top">Kimi</td><td align="left" valign="top">8.0 (1.0)</td><td align="left" valign="top">9.1 (1.0)</td><td align="left" valign="top">8.5 (1.4)</td><td align="left" valign="top">8.0 (1.6)</td></tr><tr><td align="left" valign="top">Qwen</td><td align="left" valign="top">9.0 (0.8)</td><td align="left" valign="top">9.2 (0.7)</td><td align="left" valign="top">8.9 (0.9)</td><td align="left" valign="top">8.6 (1.2)</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">8.6 (1.0)</td><td align="left" valign="top">9.4 (1.1)</td><td align="left" 
valign="top">8.7 (1.3)</td><td align="left" valign="top">8.2 (1.4)</td></tr><tr><td align="left" valign="top">Gemini-1.5-Pro</td><td align="left" valign="top">8.8 (1.1)</td><td align="left" valign="top">9.1 (1.0)</td><td align="left" valign="top">9.2 (1.1)</td><td align="left" valign="top">8.1 (1.3)</td></tr><tr><td align="left" valign="top">Claude-3.5-Sonnet</td><td align="left" valign="top">9.1 (1.0)</td><td align="left" valign="top">9.4 (0.8)</td><td align="left" valign="top">9.2 (0.9)</td><td align="left" valign="top">8.3 (1.5)</td></tr><tr><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">9.4 (1.0)</td><td align="left" valign="top">9.5 (1.1)</td><td align="left" valign="top">9.4 (0.8)</td><td align="left" valign="top">9.3 (1.0)</td></tr></tbody></table></table-wrap></sec><sec id="s3-1-2"><title>Normality and Homogeneity of Variance Tests</title><p>Shapiro&#x2013;Wilk test showed that score data for all problem types conformed to normal distribution (<italic>P</italic>&#x003E;.05). 
Levene test indicated homogeneity of variance for medication consultation (<italic>P</italic>=.12), medication education (<italic>P</italic>=.09), prescription review (<italic>P</italic>=.15), and case analysis (<italic>P</italic>=.10).</p></sec><sec id="s3-1-3"><title>ANOVA</title><p>Across all problem types, scores differed significantly among the generative AI dialogue systems (<italic>P</italic>&#x003C;.001), and the effect sizes (&#x03B7;&#x00B2;&#x003E;0.3) indicated that these differences were large in magnitude (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>ANOVA results (one-way, <italic>&#x03B1;</italic>=.05).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question type</td><td align="left" valign="bottom">F (<italic>df</italic>) value</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">&#x03B7;&#x00B2; (effect size)</td><td align="left" valign="bottom">Significance conclusion</td></tr></thead><tbody><tr><td align="left" valign="top">Medication consultation</td><td align="left" valign="top">14.3 (7, 44)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.36</td><td align="left" valign="top">Significant differences exist</td></tr><tr><td align="left" valign="top">Medication education</td><td align="left" valign="top">12.8 (7, 44)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.32</td><td align="left" valign="top">Significant differences exist</td></tr><tr><td align="left" valign="top">Prescription audit</td><td align="left" valign="top">16.5 (7, 44)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.40</td><td align="left" valign="top">Significant differences exist</td></tr><tr><td align="left" valign="top">Case analysis and pharmaceutical care</td><td align="left" 
valign="top">18.1 (7, 44)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.43</td><td align="left" valign="top">Significant differences exist</td></tr></tbody></table></table-wrap></sec><sec id="s3-1-4"><title>Multiple Comparisons (Tukey HSD)</title><p>DeepSeek-R1 demonstrated the best overall performance across all 4 task categories, with particularly significant advantages in prescription review, case analysis, and pharmaceutical care tasks. Qwen and GPT-4o showed similar performance in most tasks with no significant differences. Kimi and Doubao exhibited significant gaps (<italic>P</italic>&#x003C;.05) in certain tasks when compared to DeepSeek-R1. ERNIE Bot consistently underperformed, with highly significant differences (<italic>P</italic>&#x003C;.001) compared with other models.</p><p>Specifically, for medication consultation, DeepSeek-R1 was extremely significantly superior to ERNIE Bot (<italic>P</italic>&#x003C;.001). For medication education, most comparison results were not significant, indicating minimal differences among generative AI dialogue systems in this aspect. For prescription review, ERNIE Bot performed the poorest, showing significant (<italic>P</italic>&#x003C;.05) or extremely significant (<italic>P</italic>&#x003C;.001) differences when compared to all other generative AI dialogue systems. For case analysis and pharmaceutical care, ERNIE Bot remained significantly weaker than most models (eg, very significant difference compared to GPT-4o, <italic>P</italic>&#x003C;.01), while DeepSeek-R1 maintained stable performance with significant differences compared to all other models. 
See <xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Pairwise comparison results of 8 generative artificial intelligence (AI) dialogue systems across 4 question types (Tukey Honestly Significant Difference test). Only the groups with significant differences are displayed.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="middle">Task and comparison of significant difference groups</td><td align="left" valign="middle" colspan="2">Mean difference (95% CI)</td><td align="left" valign="middle"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Medication consultation</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-Qwen</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.7 (&#x2013;2.9 to &#x2013;0.5)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-GPT-4o</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.5 to &#x2013;0.1)</td><td align="char" char="." valign="middle">.03</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Gemini-1.5-Pro</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.5 (&#x2013;2.7 to 0.27)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Claude-3.5-Sonnet</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.7 (&#x2013;2.9 to &#x2013;0.5)</td><td align="char" char="." 
valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;2.1 (&#x2013;3.2 to &#x2013;0.8)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.4 to &#x2013;0.03)</td><td align="char" char="." valign="middle">.04</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kimi-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.4 (&#x2013;2.6 to &#x2013;0.1)</td><td align="char" char="." valign="middle">.01</td></tr><tr><td align="left" valign="top" colspan="4">Prescription audit</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Kimi</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.5 to &#x2013;0.1)</td><td align="char" char="." valign="middle">.04</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-Qwen</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.6 (&#x2013;2.8 to &#x2013;0.4)</td><td align="char" char="." valign="middle">.002</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-GPT-4o</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.4 (&#x2013;2.6 to &#x2013;0.19)</td><td align="char" char="." 
valign="middle">.01</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Gemini-1.5-Pro</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.8 (&#x2013;3.1 to &#x2013;0.7)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Claude-3.5-Sonnet</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.8 (&#x2013;3.1 to &#x2013;0.7)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;2.1 (&#x2013;2.3 to 0.09)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao-Qwen</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.5 to &#x2013;0.1)</td><td align="char" char="." valign="middle">.02</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao- Gemini-1.5-Pro</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.6 (&#x2013;2.8 to &#x2013;0.4)</td><td align="char" char="." valign="middle">.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao - Claude-3.5-Sonnet</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.6 (&#x2013;2.8 to &#x2013;0.4)</td><td align="char" char="." 
valign="middle">.002</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.9 (&#x2013;3.1 to &#x2013;0.68)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="4">Case analysis and pharmaceutical care</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Kimi</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.2 (&#x2013;2.4 to &#x2013;0.03)</td><td align="char" char="." valign="middle">.04</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot-Qwen</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.8 (&#x2013;3.0 to &#x2013;0.6)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- GPT-4o</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.5 (&#x2013;2.7 to &#x2013;0.2)</td><td align="char" char="." valign="middle">.007</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Gemini-1.5-Pro</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.5 to &#x2013;0.13)</td><td align="char" char="." valign="middle">.02</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- Claude-3.5-Sonnet</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.6 (&#x2013;2.8 to &#x2013;0.4)</td><td align="char" char="." 
valign="middle">.003</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ERNIE Bot- DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;2.6 (&#x2013;3.8 to &#x2013;1.4)</td><td align="char" char="." valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Doubao-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.6 (&#x2013;2.8 to &#x2013;0.4)</td><td align="char" char="." valign="middle">.002</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kimi-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.3 (&#x2013;2.5 to &#x2013;0.1)</td><td align="char" char="." valign="middle">.02</td></tr><tr><td align="left" valign="middle"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gemini-1.5-Pro-DeepSeek-R1</td><td align="char" char="." valign="middle" colspan="2">&#x2013;1.2 (&#x2013;2.4 to &#x2013;0.02)</td><td align="char" char="." valign="middle">.043</td></tr></tbody></table></table-wrap></sec><sec id="s3-1-5"><title>Interrater Reliability (ICC Values)</title><p>The ICC values for all 6 dimensions in medication consultation, medication education, and prescription review were &#x2265;0.75, indicating good interrater reliability. Accuracy and logical coherence showed the highest consistency (ICC&#x003E;0.8), while conciseness in case analysis and pharmaceutical care demonstrated the lowest consistency (ICC=0.70), suggesting considerable disagreement among raters when evaluating generative AI dialogue systems&#x2019; solutions to complex problems (<xref ref-type="table" rid="table4">Table 4</xref>). 
A higher ICC value indicates stronger consistency, while a lower ICC value reflects weaker consistency. The darker red in <xref ref-type="fig" rid="figure3">Figure 3</xref> indicates higher consistency, while deeper blue indicates lower consistency.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Intraclass correlation coefficient values by dimension (2-way random effects model).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Medication consultation</td><td align="left" valign="bottom">Medication education</td><td align="left" valign="bottom">Prescription audit</td><td align="left" valign="bottom">Case analysis and pharmaceutical care</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">0.88</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.82</td><td align="left" valign="top">0.79</td></tr><tr><td align="left" valign="top">Preciseness</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top">Applicability</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top">Logicality</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.77</td></tr><tr><td align="left" valign="top">Conciseness</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.70</td></tr><tr><td align="left" valign="top">Universality</td><td align="left" valign="top">0.82</td><td align="left" valign="top">0.80</td><td align="left" 
valign="top">0.77</td><td align="left" valign="top">0.74</td></tr></tbody></table></table-wrap></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>AI applications in health care demonstrate enormous development potential while facing numerous challenges [<xref ref-type="bibr" rid="ref11">11</xref>]. This study, through constructing a 6-dimensional evaluation system, systematically compares the application effects and limitations of current mainstream conversational AI platforms, providing important evidence for further optimization of intelligent pharmaceutical systems.</p></sec><sec id="s4-2"><title>Key Findings</title><p>This study systematically evaluated the comprehensive performance of 8 mainstream generative AI systems across 4 core clinical pharmacy scenarios: medication consultation, medication education, prescription review, and case analysis with pharmaceutical care. The principal findings are as follows.</p><sec id="s4-2-1"><title>Significant Model Performance Stratification</title><p>DeepSeek-R1 demonstrated superior performance across all 4 task categories (composite score: medication consultation 9.4, SD 1.0, case analysis 9.3, SD 1.0), exhibiting particularly pronounced advantages in complex tasks such as case analysis with pharmaceutical care (<italic>P</italic>&#x003C;.05).</p><p>Second-tier models (Qwen, GPT-4o, Claude-3.5-Sonnet, and Gemini-1.5-Pro) performed well in specific scenarios but consistently underperformed DeepSeek-R1 overall (eg, Gemini-1.5-Pro scored 9.2, SD 1.1 in prescription review vs DeepSeek-R1&#x2019;s 9.4, SD 0.8).</p><p>Doubao and Kimi exhibited significant performance variability, while ERNIE Bot consistently lagged significantly across all tasks (eg, case analysis score=6.8, SD 1.5; <italic>P</italic>&#x003C;.001 difference vs DeepSeek-R1).</p></sec><sec id="s4-2-2"><title>Critical Limitations Exposed</title><p>Static knowledge bases and lack of 
localization: most models failed to adapt to Chinese clinical practices (eg, 90% incorrectly recommended macrolides for drug-resistant <italic>Mycoplasma pneumoniae</italic> infection; only DeepSeek-R1 correctly advised short-term doxycycline based on American Academy of Pediatrics [AAP] guidelines).</p><p>High-risk decision blind spots: models frequently overlooked critical contraindications (eg, only 25% recognized ethambutol is contraindicated in optic neuritis) and special regulatory requirements (eg, no model warned about the violation of prescribing diazepam beyond 7 days).</p><p>Complex reasoning deficiencies: in case analysis tasks, models struggled to integrate multidimensional information to formulate individualized care plans (eg, only Claude-3.5-Sonnet identified the diagnostic contradiction of &#x201C;benign prostatic hyperplasia in a female patient&#x201D;).</p><p>These limitations highlight 3 major technical bottlenecks: lagging knowledge base updates, insufficient complex decision-making reasoning, and prompt sensitivity risks (see the section &#x201C;Technical Bottlenecks and Clinical Challenges&#x201D; for details).</p></sec><sec id="s4-2-3"><title>Rater Consistency Challenge</title><p>The &#x201C;Conciseness&#x201D; dimension in case analysis tasks exhibited the lowest interrater reliability (ICC=0.70), reflecting inconsistent assessment standards for complex problems.</p><p>Therefore, while current generative AI can serve as an auxiliary reference tool for clinical pharmacists, its error rate in high-risk decisions (eg, omission of contraindications) and lack of localization capability preclude its use as an independent basis for clinical decision-making.</p></sec><sec id="s4-2-4"><title>Technical Bottlenecks and Clinical Challenges</title><sec id="s4-2-4-1"><title>Knowledge Base Limitations and Lag in Dynamic Update</title><p>This study found that all 8 generative AI dialogue systems shared a common deficiency in the areas of medication consultation and 
medication education: a lack of comprehensive disease assessment capabilities, making it difficult for them to provide personalized medication guidance. For instance, their guidance on the use of special devices was often inaccurate. When queried about the proper use of &#x201C;Budesonide and Formoterol Inhaler,&#x201D; most systems failed to accurately identify the specific type of device involved. Instead, they incorrectly described the usage instructions for other devices (eg, Salbutamol Aerosol), potentially misleading users.</p><p>Regarding the question &#x201C;Can a 6-year-old child with <italic>Mycoplasma pneumoniae</italic> infection take doxycycline?&#x201D; only DeepSeek-R1 provided a comprehensive answer incorporating the latest guidelines, &#x201C;The risk of tooth discoloration from short-term doxycycline therapy (&#x2264;21 days) is extremely low, and organizations such as the AAP have relaxed relevant restrictions. Clinicians may consider using doxycycline in the following circumstances: when macrolides are resistant or ineffective; when the child&#x2019;s condition is severe (eg, persistent high fever and lung consolidation); when no other safe alternative medications are available.&#x201D; Other generative AI dialogue systems still only recommended macrolide antibiotics, which are unsuitable for China&#x2019;s environment, where <italic>Mycoplasma pneumoniae</italic> has high resistance to macrolide antibacterial drugs. This is closely related to training data relying on static knowledge bases and lacking real-time, evidence-based pharmaceutical knowledge updates. DeepSeek, by leveraging publicly available open-source datasets to facilitate continuous learning, can enhance adaptability to evolving medical knowledge and scientific reasoning [<xref ref-type="bibr" rid="ref12">12</xref>]. Nevertheless, DeepSeek-R1 also demonstrated notable limitations, including overly specialized terminology and a lack of concise expression. 
Particularly when responding to simple medication inquiries, the answers were excessively complex and lengthy, making it more suitable as a reference tool for health care professionals rather than for general users.</p><p>In addition, semantic ambiguity issues in Chinese contexts (as shown in the &#x201C;prostate hyperplasia&#x201D; misdiagnosis case) highlight the deficiencies in localized medical knowledge base construction. Furthermore, the international models (GPT-4o, Gemini-1.5-Pro, and Claude-3.5-Sonnet) exhibit biases in their understanding of culturally sensitive issues. For example, when queried about the risks of concomitant use of Chinese herbal injections and Warfarin, these models may overlook the impact of <italic>CYP2C9</italic> gene polymorphisms prevalent in East Asian populations. Addressing such culture-gene interactions necessitates optimization through localized training frameworks. Research indicates an increasingly evident trend of using transformer-based language models in various natural language processing models in the medical field [<xref ref-type="bibr" rid="ref13">13</xref>], emphasizing the necessity of developing multilevel language training frameworks to adapt to professional medical environments [<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s4-2-4-2"><title>Deficiencies in Complex Reasoning and Associated Ethical Risks</title><p>In prescription review, for a simple case &#x201C;16-year-old male patient diagnosed with periapical abscess [<xref ref-type="bibr" rid="ref15">15</xref>], prescribed levofloxacin tablets 0.2 g po bid, metronidazole tablets 0.6 g po tid, chlorhexidine acetate mouthwash 5 ml tid,&#x201D; only ERNIE Bot, Doubao, and Kimi failed to identify that quinolone antibiotics are contraindicated in patients younger than 18 years of age, while the other 5 systems successfully identified this key safety issue.</p><p>However, for more complex prescriptions, such as &#x201C;61-year-old male patient 
diagnosed with optic neuritis and tuberculous encephalopathy, prescribed: isoniazid 0.3 g qd, pyrazinamide 3 g biw, ethambutol tablets 0.75 g qd, rifampin capsules 0.6 g qd,&#x201D; only Qwen and DeepSeek-R1 accurately identified the critical contraindication that ethambutol could exacerbate visual impairment in patients with optic neuritis.</p><p>Notably, for the prescription &#x201C;40-year-old female patient diagnosed with insomnia, prescribed: diazepam tablets 10 mg po qn for 10 days&#x201D; none of the 8 generative AI dialogue systems accurately identified the special management requirements for diazepam as a Class II psychotropic medication&#x2014;that Class II psychotropic medications should not be prescribed for more than 7 days per prescription, and special circumstances require physician documentation. This regulatory requirement has significant importance in clinical pharmacists&#x2019; routine prescription review work.</p><p>Even more concerning, for the prescription &#x201C;71-year-old female patient diagnosed with hypertension and prostatic hyperplasia, prescribed: amlodipine tablets 5 mg qd, irbesartan and hydrochlorothiazide tablets 150 mg:12.5 mg qd,&#x201D; only Claude-3.5-Sonnet successfully identified the obvious error: the diagnosis was inconsistent with the patient&#x2019;s gender (females do not have prostates and cannot be diagnosed with prostatic hyperplasia). The other 7 generative AI dialogue systems failed to detect this serious error. 
This indicates that generative AI systems still have serious hallucinations and cognitive limitations in medical reasoning.</p><p>In the case analysis and pharmaceutical care aspect, this study required generative AI dialogue systems to assume the role of clinical pharmacists and analyze drug therapy regimens based on patient information including basic demographics, reason for consultation, present illness, past medical history, medication history, family history, allergies, adverse reaction history, unhealthy habits, diagnoses, current medication records, and laboratory examination results. They were also asked to develop a pharmaceutical care plan. These tasks closely simulate the professional work of clinical pharmacists during daily rounds, requiring comprehensive and in-depth analysis and assessment of cases to formulate medication monitoring plans tailored to individual patient characteristics.</p><p>The results indicated that all 8 generative AI systems faced significant challenges in executing these complex professional tasks, struggling to simultaneously ensure both accuracy and comprehensiveness in their responses. This reflects the current limitations of AI systems in integrating diverse clinical information and making professional pharmaceutical decisions, particularly in complex clinical scenarios requiring consideration of multiple factors. This relates to AI&#x2019;s lack of clinical contextual reasoning and insufficient understanding of complex instructions.</p><p>A recent research review indicates that AI has higher error rates when integrating multimodal data (such as laboratory indicators and imaging results) compared to single-modality tasks [<xref ref-type="bibr" rid="ref16">16</xref>]. Before resolving these technical bottlenecks, we cannot avoid the issue of accountability for AI-driven decisions [<xref ref-type="bibr" rid="ref17">17</xref>]. 
How legal responsibility should be defined when patients experience adverse reactions due to incorrect AI recommendations remains unclear with no explicit regulations currently issued [<xref ref-type="bibr" rid="ref18">18</xref>]. Therefore, AI dialogue systems should be positioned as &#x201C;augmented intelligence&#x201D; tools, establishing &#x201C;human-machine co-review&#x201D; mechanisms [<xref ref-type="bibr" rid="ref19">19</xref>], strengthening human supervision, and ensuring pharmacists retain final decision-making authority over AI outputs. The ethical risks associated with AI identified in this study strongly align with the medical AI regulatory requirements emphasized in the European Union AI Act [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec><sec id="s4-2-4-3"><title>Instruction Adherence and Stability Issues</title><p>This study used a single, independent query mode for evaluation, where each question was input to the model as an independent conversation. Under this mode, no instances were observed of the model significantly &#x201C;forgetting&#x201D; or ignoring core instructions (eg, &#x201C;act as a pharmacist,&#x201D; &#x201C;incorporate the latest clinical guidelines,&#x201D; and &#x201C;develop a pharmaceutical care plan&#x201D;) within a single response.</p><p>However, it is crucial to emphasize that this study did not test the stability of the model&#x2019;s instruction adherence across continuous, multiturn dialogues. In real-world clinical deployment, users may engage in sustained conversations with an AI system involving multiple questions. Existing technical observations report that generative AI carries a risk of instruction drift or context forgetting during extended conversations or multiturn interactions. 
This could lead to subsequent responses deviating from the initially specified role or requirements, potentially compromising the reliability and safety of the answers.</p><p>Therefore, future research should design dedicated testing protocols to evaluate model instruction consistency in multiturn dialogues. Furthermore, exploring methods to enhance stability through prompt engineering optimization (eg, periodic instruction reinforcement) or model fine-tuning is warranted.</p></sec></sec></sec><sec id="s4-3"><title>Compound Error Risk</title><p>While the scoring criteria in this study focused on explicit errors (eg, omission of contraindications), they did not detect potentially hazardous information embedded within otherwise correct responses. For instance, an AI model correctly identified a drug-drug interaction but ambiguously advised to &#x201C;monitor during use&#x201D; without explicitly stating the need for immediate discontinuation. In pediatric dosing advice, a model correctly recommended a drug dose but failed to emphasize the necessity of weight-based adjustment. Such errors carry a risk of clinical misinterpretation. Therefore, future research may need to develop algorithms for detecting latent risks, such as real-time validation based on databases like the FDA Adverse Event Reporting System (FAERS).</p></sec><sec id="s4-4"><title>Application Opportunities and Optimization Pathways</title><sec id="s4-4-1"><title>Personalized Medication Decision Support</title><p>The DeepSeek-R1 model demonstrated good guideline adherence in pediatric doxycycline medication recommendations (incorporating the latest AAP recommendations), highlighting the potential application of AI in personalized medication decision support [<xref ref-type="bibr" rid="ref21">21</xref>]. 
Relevant research confirms that AI can help formulate interventions under the guidance of predictive models by forecasting individual responses to treatment and monitoring patient progression, thereby modifying individualized treatment plans [<xref ref-type="bibr" rid="ref22">22</xref>]. However, it should be noted that AI still lags significantly behind human experts in providing individualized treatment plans. For instance, Marcaccini et al [<xref ref-type="bibr" rid="ref23">23</xref>] discovered that while AI-driven models demonstrate strong diagnostic accuracy and readability, further refinements are needed to improve treatment specificity and personalization. Looking forward, constructing an &#x201C;AI pharmaceutical knowledge graph&#x201D; [<xref ref-type="bibr" rid="ref24">24</xref>] that correlates individual metabolic characteristics (such as CYP450 enzyme phenotypes) with pharmacokinetic data could provide dynamic dosing optimization strategies based on the latest evidence for clinical practice.</p></sec><sec id="s4-4-2"><title>Process Automation and Resource Optimization</title><p>Compared to clinical pharmacists, generative AI dialogue systems possess powerful information retrieval, data integration, and conversation capabilities [<xref ref-type="bibr" rid="ref25">25</xref>]. Natural language processing technology can automatically extract and analyze data from large volumes of electronic medical records [<xref ref-type="bibr" rid="ref26">26</xref>], which will greatly reduce manual time consumption, and the application of this technology is expected to significantly reduce the burden on clinical pharmacists in routine documentation such as medication history records. 
By communicating with users through verbal or nonverbal means, simulated pharmacist assistants have achieved improved medication adherence by providing medication education to older patients with diabetes [<xref ref-type="bibr" rid="ref27">27</xref>], all of which enables more optimized resource allocation.</p></sec><sec id="s4-4-3"><title>Medical Education and Skills Training</title><p>Generative AI dialogue systems provide new tools for clinical educational interventions and medical practice [<xref ref-type="bibr" rid="ref28">28</xref>], bringing new dimensions of personalized learning, enhanced visualization, and simulation-based clinical training to the forefront [<xref ref-type="bibr" rid="ref29">29</xref>]. In addition, AI-driven simulations offer realistic immersive training opportunities that prepare students for complex clinical situations and cultivate the interprofessional collaboration skills essential for modern health care [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s4-4-4"><title>Comparison With Previous Work</title><p>This study engages in significant dialogue with and extends existing literature in terms of both methodology and findings.</p><sec id="s4-4-4-1"><title>Deepened Evaluation Framework</title><p>Compared to the 3-dimensional evaluation frameworks proposed by other researchers, this study innovatively constructs a &#x201C;six-dimensional evaluation system,&#x201D; providing a more comprehensive capture of AI efficacy in pharmaceutical practice. 
Unlike single-model studies [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], this work presents the first cross-model comparison of 8 mainstream AI systems, revealing significant performance stratification (eg, DeepSeek-R1 significantly outperformed GPT-4o in case analysis tasks, <italic>P</italic>&#x003C;.05).</p></sec><sec id="s4-4-4-2"><title>Validation and Extension of Key Limitations</title><p>This study confirms the warning highlighted in the literature [<xref ref-type="bibr" rid="ref15">15</xref>]: AI error rates remain high in complex medication decision-making (eg, a contraindication omission rate of 75% in prescription review tasks). Furthermore, semantic ambiguities unique to the Chinese context (eg, the gender contradiction in &#x201C;benign prostatic hyperplasia&#x201D;) resulted in higher error rates compared with monolingual settings, underscoring the urgency for cross-lingual training frameworks.</p></sec><sec id="s4-4-4-3"><title>Coverage of Innovative Scenarios</title><p>In contrast to studies [<xref ref-type="bibr" rid="ref3">3</xref>] limited to information retrieval, this research systematically validates, for the first time, AI performance across a continuous clinical decision-making chain (eg, from prescription review to pharmaceutical care plan development). This reveals deficiencies in multimodal data integration and contextual reasoning (resulting in higher error rates compared to single-task evaluations).</p></sec><sec id="s4-4-4-4"><title>Incremental Contribution of This Study</title><p>Through its systematic evaluation across multiple models, diverse scenarios, and 6 dimensions, this study provides an empirical foundation for the localized adaptation pathways (eg, dynamic knowledge graph updating) and ethical deployment boundaries (eg, a &#x201C;human-AI co-review&#x201D; mechanism) of generative AI in clinical pharmacy. 
These findings call for the establishment of cross-lingual training frameworks and continuous evaluation systems.</p></sec></sec></sec><sec id="s4-5"><title>Limitations and Future Directions</title><sec id="s4-5-1"><title>Scope of Evaluated Scenarios</title><p>While this study focused on 4 core scenarios in clinical pharmacy practice, it did not comprehensively cover all potential application domains, such as pharmacovigilance signal mining, pharmacoeconomic evaluation, and public health emergency scenarios. Future studies should expand the scope of evaluation, for instance, by designing emergency medication test sets to assess model reliability under limited evidence.</p></sec><sec id="s4-5-2"><title>Sample Size and Complexity</title><p>Although 48 questions were included, the number of Case Analysis and Pharmaceutical Care tasks was relatively low (only 8 questions). Furthermore, the complexity of the questions may still be insufficient to fully reflect the models&#x2019; capability to handle extremely complex, rare, or multisystem real-world cases. Future research should increase sample size, incorporate more diverse and higher-complexity real-world cases, and consider supplementing data using standardized patient data or synthetic data generation techniques.</p></sec><sec id="s4-5-3"><title>Prompt Sensitivity</title><p>Generative AI systems exhibited significant sensitivity to the wording and structure of input prompts (ie, prompt sensitivity). Minor variations in question phrasing (eg, adjusting keyword order, adding or removing qualifiers) could lead to divergent responses. While this study mitigated this variability through standardized instruction templates, it did not systematically quantify the impact of this sensitivity on the results. In addition, the 3 international models (GPT-4o, Gemini-1.5-Pro, and Claude-3.5-Sonnet) may have limited comprehension of Chinese medical terminology. 
Future research could use joint embedding models to reduce semantic bias or use adversarial prompt testing to evaluate model robustness and optimize instruction design.</p></sec><sec id="s4-5-4"><title>Model Dynamic Updates</title><p>AI models undergo rapid iteration and updates (eg, Gemini-1.5-Pro is updated monthly). This study reflects the performance of specific model versions at a fixed point in time (February 20, 2025), and findings may change with subsequent model updates. There is an urgent need to establish continuous, dynamic evaluation frameworks and benchmarks to track the evolution of model performance.</p></sec><sec id="s4-5-5"><title>Insufficient Visualization of Results</title><p>This study recorded and summarized the total scores for each model across the 4 task types. However, it did not calculate the average scores for each model across the 6 evaluation dimensions (spanning all 4 task types). Consequently, it cannot intuitively display each model&#x2019;s relative strengths and weaknesses across dimensions or facilitate cross-model performance comparisons by dimension (eg, using radar charts). Future evaluations should incorporate these metrics.</p></sec><sec id="s4-5-6"><title>Lack of Real-World Impact Assessment</title><p>The study evaluated model output quality in a controlled environment but did not assess the actual impact on pharmacist workflow efficiency, decision-making quality, or patient outcomes in real-world clinical settings. Future research should conduct prospective implementation studies or randomized controlled trials.</p></sec></sec><sec id="s4-6"><title>Conclusions</title><p>This study conducted a systematic evaluation and comparative analysis of the application efficacy of 8 mainstream domestic and international generative AI systems across 4 core clinical pharmacy practice scenarios by constructing a 6-dimensional evaluation system. 
The results demonstrate that DeepSeek-R1 outperformed other models in overall performance, exhibiting particularly significant advantages in handling complex case analysis and pharmaceutical care tasks. However, all models exhibited limitations, prominently manifested in lagging knowledge base updates (eg, incorrect instructions for special devices and lack of localized recommendations), insufficient complex decision-making reasoning capabilities (eg, failure to identify critical medication contraindications and special regulatory requirements), sensitivity to prompt instructions, and overly technical terminology in outputs. Interrater reliability analysis revealed substantial disagreement in evaluating the conciseness dimension for complex tasks such as case analysis.</p><p>Based on these findings, this study concludes that while current generative AI systems demonstrate significant potential for efficiency gains and value as decision-support tools in clinical pharmacy, their responses still contain non-negligible errors and limitations, particularly at high-risk decision points. Therefore, at this stage, they should be strictly positioned as auxiliary reference tools for clinical pharmacists, not as independent bases for clinical decision-making. Future development should focus on overcoming key bottlenecks, including achieving dynamic knowledge updating and localization adaptation, enhancing reasoning capabilities in complex scenarios, improving prompt robustness and output interpretability, and establishing continuous evaluation mechanisms. 
Ultimately, safe, reliable, and patient-centered intelligent pharmaceutical care systems should be built through interdisciplinary collaboration integrating evidence-based medicine, ethical norms, and technological innovation.</p></sec></sec></body><back><ack><p>This research was funded by the Henan Province Medical Science and Technology Research and Joint Construction Project (grant numbers 20220125 and 20230129).</p></ack><notes><sec><title>Data Availability</title><p>All data produced in this study are available upon reasonable request to the authors.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AAP</term><def><p>American Academy of Pediatrics</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">CHA</term><def><p>Chinese Hospital Association</p></def></def-item><def-item><term id="abb4">CMA</term><def><p>Chinese Medical Association</p></def></def-item><def-item><term id="abb5">FAERS</term><def><p>FDA Adverse Event Reporting System</p></def></def-item><def-item><term id="abb6">HSD</term><def><p>Honestly Significant Difference</p></def></def-item><def-item><term id="abb7">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb8">MTM</term><def><p>Medication Therapy Management</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hatem</surname><given-names>NAH</given-names> </name></person-group><article-title>Advancing pharmacy practice: the role of intelligence-driven pharmacy practice and the emergence of pharmacointelligence</article-title><source>Integr Pharm Res Pract</source><year>2024</year><volume>13</volume><fpage>139</fpage><lpage>153</lpage><pub-id 
pub-id-type="doi">10.2147/IPRP.S466748</pub-id><pub-id pub-id-type="medline">39220215</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Ashwal</surname><given-names>FY</given-names> </name><name name-style="western"><surname>Zawiah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gharaibeh</surname><given-names>L</given-names> </name><name name-style="western"><surname>Abu-Farha</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bitar</surname><given-names>AN</given-names> </name></person-group><article-title>Evaluating the sensitivity, specificity, and accuracy of ChatGPT-3.5, ChatGPT-4, Bing AI, and Bard against conventional drug-drug interactions clinical tools</article-title><source>Drug Healthc Patient Saf</source><year>2023</year><volume>15</volume><fpage>137</fpage><lpage>147</lpage><pub-id pub-id-type="doi">10.2147/DHPS.S425858</pub-id><pub-id pub-id-type="medline">37750052</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moulaei</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yadegari</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Baharestani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Farzanbakhsh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sabet</surname><given-names>B</given-names> </name><name name-style="western"><surname>Reza Afrash</surname><given-names>M</given-names> </name></person-group><article-title>Generative artificial intelligence in healthcare: a scoping review on benefits, challenges and applications</article-title><source>Int J Med Inform</source><year>2024</year><month>08</month><volume>188</volume><fpage>105474</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105474</pub-id><pub-id pub-id-type="medline">38733640</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rawashdeh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>AlRyalat</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Prasad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT and artificial intelligence in transplantation research: is it always correct?</article-title><source>Cureus</source><year>2023</year><month>07</month><volume>15</volume><issue>7</issue><fpage>e42150</fpage><pub-id pub-id-type="doi">10.7759/cureus.42150</pub-id><pub-id pub-id-type="medline">37602076</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Seng</surname><given-names>BJJ</given-names> </name><name 
name-style="western"><surname>Law</surname><given-names>JZF</given-names> </name><etal/></person-group><article-title>Artificial intelligence, ChatGPT, and other large language models for social determinants of health: current state and future directions</article-title><source>Cell Rep Med</source><year>2024</year><month>01</month><day>16</day><volume>5</volume><issue>1</issue><fpage>101356</fpage><pub-id pub-id-type="doi">10.1016/j.xcrm.2023.101356</pub-id><pub-id pub-id-type="medline">38232690</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kamel Rahimi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pienaar</surname><given-names>O</given-names> </name><name name-style="western"><surname>Ghadimi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Implementing AI in hospitals to achieve a learning health system: systematic review of current enablers and barriers</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>2</day><volume>26</volume><fpage>e49655</fpage><pub-id pub-id-type="doi">10.2196/49655</pub-id><pub-id pub-id-type="medline">39094106</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Estau</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name></person-group><article-title>Evaluating the performance of 
ChatGPT in clinical pharmacy: a comparative study of ChatGPT and clinical pharmacists</article-title><source>Br J Clin Pharmacol</source><year>2024</year><month>01</month><volume>90</volume><issue>1</issue><fpage>232</fpage><lpage>238</lpage><pub-id pub-id-type="doi">10.1111/bcp.15896</pub-id><pub-id pub-id-type="medline">37626010</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Syrowatka</surname><given-names>A</given-names> </name><name name-style="western"><surname>Song</surname><given-names>W</given-names> </name><name name-style="western"><surname>Amato</surname><given-names>MG</given-names> </name><etal/></person-group><article-title>Key use cases for artificial intelligence to reduce the frequency of adverse drug events: a scoping review</article-title><source>Lancet Digit Health</source><year>2022</year><month>02</month><volume>4</volume><issue>2</issue><fpage>e137</fpage><lpage>e148</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(21)00229-6</pub-id><pub-id pub-id-type="medline">34836823</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Szumilas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ochmann</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zi&#x0119;ba</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Evaluation of AI-driven LabTest checker for diagnostic accuracy and safety: prospective cohort study</article-title><source>JMIR Med Inform</source><year>2024</year><month>08</month><day>14</day><volume>12</volume><fpage>e57162</fpage><pub-id pub-id-type="doi">10.2196/57162</pub-id><pub-id pub-id-type="medline">39149851</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chauhan</surname><given-names>V</given-names> </name><name name-style="western"><surname>Devi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Patil</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence, the digital surgeon: unravelling its emerging footprint in healthcare - the narrative review</article-title><source>J Multidiscip Healthc</source><year>2024</year><volume>17</volume><fpage>4011</fpage><lpage>4022</lpage><pub-id pub-id-type="doi">10.2147/JMDH.S482757</pub-id><pub-id pub-id-type="medline">39165254</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Temsah</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altamimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title><source>Cureus</source><year>2025</year><month>02</month><volume>17</volume><issue>2</issue><fpage>e79221</fpage><pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id><pub-id pub-id-type="medline">39974299</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaitarova</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zaghir</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lavelli</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Krauthammer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rinaldi</surname><given-names>F</given-names> </name></person-group><article-title>Exploring the latest highlights in medical natural language processing across multiple languages: a survey</article-title><source>Yearb Med Inform</source><year>2023</year><month>08</month><volume>32</volume><issue>1</issue><fpage>230</fpage><lpage>243</lpage><pub-id pub-id-type="doi">10.1055/s-0043-1768726</pub-id><pub-id pub-id-type="medline">38147865</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ajit-Roger</surname><given-names>E</given-names> </name><name name-style="western"><surname>Moise</surname><given-names>A</given-names> </name><name name-style="western"><surname>Peralta</surname><given-names>C</given-names> </name><name name-style="western"><surname>Orishchak</surname><given-names>O</given-names> </name><name name-style="western"><surname>Daniel</surname><given-names>SJ</given-names> </name></person-group><article-title>Enhancing multilingual patient education: ChatGPT&#x2019;s accuracy and readability for SSNHL queries in English and Spanish</article-title><source>OTO Open</source><year>2024</year><volume>8</volume><issue>4</issue><fpage>e70048</fpage><pub-id pub-id-type="doi">10.1002/oto2.70048</pub-id><pub-id pub-id-type="medline">39664064</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elendu</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Amaechi</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Elendu</surname><given-names>TC</given-names> </name><etal/></person-group><article-title>Ethical implications of AI and robotics in 
healthcare: a review</article-title><source>Medicine (Baltimore)</source><year>2023</year><month>12</month><day>15</day><volume>102</volume><issue>50</issue><fpage>e36671</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000036671</pub-id><pub-id pub-id-type="medline">38115340</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Lester</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>XJ</given-names> </name></person-group><article-title>Beyond binary decisions: evaluating the effects of AI error type on trust and performance in AI-assisted tasks</article-title><source>Hum Factors</source><year>2025</year><month>03</month><day>19</day><volume>2025</volume><fpage>187208251326795</fpage><pub-id pub-id-type="doi">10.1177/00187208251326795</pub-id><pub-id pub-id-type="medline">40104968</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maliha</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gerke</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>IG</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>RB</given-names> </name></person-group><article-title>Artificial intelligence and liability in medicine: balancing safety and innovation</article-title><source>Milbank Q</source><year>2021</year><month>09</month><volume>99</volume><issue>3</issue><fpage>629</fpage><lpage>647</lpage><pub-id pub-id-type="doi">10.1111/1468-0009.12504</pub-id><pub-id pub-id-type="medline">33822422</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Habli</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lawton</surname><given-names>T</given-names> </name><name name-style="western"><surname>Porter</surname><given-names>Z</given-names> </name></person-group><article-title>Artificial intelligence in health care: accountability and safety</article-title><source>Bull World Health Organ</source><year>2020</year><month>04</month><day>1</day><volume>98</volume><issue>4</issue><fpage>251</fpage><lpage>256</lpage><pub-id pub-id-type="doi">10.2471/BLT.19.237487</pub-id><pub-id pub-id-type="medline">32284648</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tessler</surname><given-names>I</given-names> </name><name name-style="western"><surname>Wolfovitz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Livneh</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Advancing medical practice with artificial intelligence: ChatGPT in healthcare</article-title><source>Isr Med Assoc J</source><year>2024</year><month>02</month><volume>26</volume><issue>2</issue><fpage>80</fpage><lpage>85</lpage><pub-id pub-id-type="medline">38420977</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ducret</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wahal</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gruson</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Trustworthy artificial intelligence in dentistry: learnings from the EU AI Act</article-title><source>J Dent 
Res</source><year>2024</year><month>10</month><volume>103</volume><issue>11</issue><fpage>1051</fpage><lpage>1056</lpage><pub-id pub-id-type="doi">10.1177/00220345241271160</pub-id><pub-id pub-id-type="medline">39311453</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dickinson</surname><given-names>H</given-names> </name><name name-style="western"><surname>Teltsch</surname><given-names>DY</given-names> </name><name name-style="western"><surname>Feifel</surname><given-names>J</given-names> </name><etal/></person-group><article-title>The unseen hand: AI-based prescribing decision support tools and the evaluation of drug safety and effectiveness</article-title><source>Drug Saf</source><year>2024</year><month>02</month><volume>47</volume><issue>2</issue><fpage>117</fpage><lpage>123</lpage><pub-id pub-id-type="doi">10.1007/s40264-023-01376-3</pub-id><pub-id pub-id-type="medline">38019365</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kale</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wankhede</surname><given-names>N</given-names> </name><name name-style="western"><surname>Pawar</surname><given-names>R</given-names> </name><etal/></person-group><article-title>AI-driven innovations in Alzheimer&#x2019;s disease: integrating early diagnosis, personalized treatment, and prognostic modelling</article-title><source>Ageing Res Rev</source><year>2024</year><month>11</month><volume>101</volume><fpage>102497</fpage><pub-id pub-id-type="doi">10.1016/j.arr.2024.102497</pub-id><pub-id pub-id-type="medline">39293530</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Marcaccini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Seth</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Management of burns: multi-center assessment comparing AI models and experienced plastic surgeons</article-title><source>J Clin Med</source><year>2025</year><month>04</month><day>29</day><volume>14</volume><issue>9</issue><fpage>3078</fpage><pub-id pub-id-type="doi">10.3390/jcm14093078</pub-id><pub-id pub-id-type="medline">40364114</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name></person-group><article-title>A multimodal framework for improving in silico drug repositioning with the prior knowledge from knowledge graphs</article-title><source>IEEE/ACM Trans Comput Biol Bioinform</source><year>2022</year><volume>19</volume><issue>5</issue><fpage>2623</fpage><lpage>2631</lpage><pub-id pub-id-type="doi">10.1109/TCBB.2021.3103595</pub-id><pub-id pub-id-type="medline">34375284</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alowais</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Alghamdi</surname><given-names>SS</given-names> </name><name 
name-style="western"><surname>Alsuhebany</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Revolutionizing healthcare: the role of artificial intelligence in clinical practice</article-title><source>BMC Med Educ</source><year>2023</year><month>09</month><day>22</day><volume>23</volume><issue>1</issue><fpage>689</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04698-z</pub-id><pub-id pub-id-type="medline">37740191</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miranda</surname><given-names>O</given-names> </name><name name-style="western"><surname>Kiehl</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Enhancing post-traumatic stress disorder patient assessment: leveraging natural language processing for research of domain criteria identification using electronic medical records</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>06</month><day>4</day><volume>24</volume><issue>1</issue><fpage>154</fpage><pub-id pub-id-type="doi">10.1186/s12911-024-02554-8</pub-id><pub-id pub-id-type="medline">38835009</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>F&#x00E9;lix</surname><given-names>IB</given-names> </name><name name-style="western"><surname>Guerreiro</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Cavaco</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Development of a complex intervention to improve adherence to antidiabetic medication in older people using an anthropomorphic virtual assistant software</article-title><source>Front 
Pharmacol</source><year>2019</year><volume>10</volume><fpage>680</fpage><pub-id pub-id-type="doi">10.3389/fphar.2019.00680</pub-id><pub-id pub-id-type="medline">31281256</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Martinez-Franco</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Sanchez-Mendiola</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mazon-Ramirez</surname><given-names>JJ</given-names> </name><etal/></person-group><article-title>Diagnostic accuracy in Family Medicine residents using a clinical decision support system (DXplain): a randomized-controlled trial</article-title><source>Diagnosis (Berl)</source><year>2018</year><month>06</month><day>27</day><volume>5</volume><issue>2</issue><fpage>71</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1515/dx-2017-0045</pub-id><pub-id pub-id-type="medline">29730649</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>M</given-names> </name></person-group><article-title>The role of AI in reshaping medical education: opportunities and challenges</article-title><source>Clin Teach</source><year>2025</year><month>04</month><volume>22</volume><issue>2</issue><fpage>e70040</fpage><pub-id pub-id-type="doi">10.1111/tct.70040</pub-id><pub-id pub-id-type="medline">39956546</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence and healthcare simulation: the shifting landscape of medical 
education</article-title><source>Cureus</source><year>2024</year><month>05</month><volume>16</volume><issue>5</issue><fpage>e59747</fpage><pub-id pub-id-type="doi">10.7759/cureus.59747</pub-id><pub-id pub-id-type="medline">38840993</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Questions 1-48.</p><media xlink:href="medinform_v13i1e76128_app1.doc" xlink:title="DOC File, 176 KB"/></supplementary-material></app-group></back></article>