<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v14i1e79039</article-id>
      <article-id pub-id-type="pmid">41481915</article-id>
      <article-id pub-id-type="doi">10.2196/79039</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Review</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Review</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Large Language Model–Based Virtual Patient Systems for History-Taking in Medical Education: Comprehensive Systematic Review</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Benis</surname>
            <given-names>Arriel</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wang</surname>
            <given-names>William</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chang</surname>
            <given-names>Eunsuk</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Dongliang</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-8675-3702</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lebai Lutfi</surname>
            <given-names>Syaheerah</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Medical Informatics Department</institution>
            <institution>College of Medicine and Health Sciences</institution>
            <institution>Sultan Qaboos University</institution>
            <addr-line>PO Box 35</addr-line>
            <addr-line>Al Khoudh</addr-line>
            <addr-line>Al Seeb, 123</addr-line>
            <country>Oman</country>
            <phone>60 134526001</phone>
            <email>s.lutfi@squ.edu.om</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7349-0061</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Artificial Intelligence &#38; Software Engineering</institution>
        <institution>School of Computer Sciences</institution>
        <institution>Universiti Sains Malaysia</institution>
        <addr-line>Penang</addr-line>
        <country>Malaysia</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Medical Informatics Department</institution>
        <institution>College of Medicine and Health Sciences</institution>
        <institution>Sultan Qaboos University</institution>
        <addr-line>Al Seeb</addr-line>
        <country>Oman</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Syaheerah Lebai Lutfi <email>s.lutfi@squ.edu.om</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>1</month>
        <year>2026</year>
      </pub-date>
      <volume>14</volume>
      <elocation-id>e79039</elocation-id>
      <history>
        <date date-type="received">
          <day>14</day>
          <month>6</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>11</day>
          <month>8</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>23</day>
          <month>10</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Dongliang Li, Syaheerah Lebai Lutfi. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 02.01.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2026/1/e79039" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs), such as GPT-3.5 and GPT-4 (OpenAI), have been transforming virtual patient systems in medical education by providing scalable and cost-effective alternatives to standardized patients. However, systematic evaluations of their performance, particularly for multimorbidity scenarios involving multiple coexisting diseases, are still limited.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This systematic review aimed to evaluate LLM-based virtual patient systems for medical history-taking, addressing four research questions: (1) simulated patient types and disease scope, (2) performance-enhancing techniques, (3) experimental designs and evaluation metrics, and (4) dataset characteristics and availability.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Following PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) 2020, 9 databases were searched (January 1, 2020, to August 18, 2025). Nontransformer LLMs and non–history-taking tasks were excluded. Multidimensional quality and bias assessments were conducted.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A total of 39 studies were included, screened by one computer science researcher under supervision. LLM-based virtual patient systems mainly simulated internal medicine and mental health disorders, with many addressing distinct single disease types but few covering multimorbidity or rare conditions. Techniques like role-based prompts, few-shot learning, multiagent frameworks, knowledge graph (KG) integration (top-k accuracy 16.02%), and fine-tuning enhanced dialogue and diagnostic accuracy. Multimodal inputs (eg, speech and imaging) improved immersion and realism. Evaluations, typically involving 10-50 students and 3-10 experts, demonstrated strong performance (top-k accuracy: 0.45-0.98, hallucination rate: 0.31%–5%, System Usability Scale [SUS] ≥80). However, small samples, inconsistent metrics, and limited controls restricted generalizability. Common datasets such as MIMIC-III (Medical Information Mart for Intensive Care-III) exhibited intensive care unit (ICU) bias and lacked diversity, affecting reproducibility and external validity.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Included studies showed moderate risk of bias, inconsistent metrics, small cohorts, and limited dataset transparency. LLM-based virtual patient systems excel in simulating multiple disease types but lack multimorbidity patient representation. KGs improve top-k accuracy and support structured disease representation and reasoning. Future research should prioritize hybrid KG-chain-of-thought architectures integrated with open-source KGs (eg, UMLS [Unified Medical Language System] and SNOMED-CT [Systematized Nomenclature of Medicine - Clinical Terms]), parameter-efficient fine-tuning, dialogue compression, multimodal LLMs, standardized metrics, larger cohorts, and open-access multimodal datasets to further enhance realism, diagnostic accuracy, fairness, and educational utility.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>virtual patient</kwd>
        <kwd>medical education</kwd>
        <kwd>history-taking</kwd>
        <kwd>simulated patients</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Since 2020, large language models (LLMs) such as GPT-3.5 (OpenAI) [<xref ref-type="bibr" rid="ref1">1</xref>] and GPT-4 (OpenAI) [<xref ref-type="bibr" rid="ref2">2</xref>] have significantly enhanced virtual patient systems in medical education. Unlike traditional methods relying on resource-intensive standardized patients or high-fidelity simulators [<xref ref-type="bibr" rid="ref3">3</xref>], LLMs provide scalable, low-risk, and cost-effective solutions by simulating realistic patient interactions across a wide range of clinical scenarios, including internal medicine, mental health disorders, and surgical and orthopedic cases [<xref ref-type="bibr" rid="ref4">4</xref>]. This capability addresses key challenges in medical education, such as limited exposure to diverse clinical cases and the high costs of traditional simulation approaches.</p>
      <p>Early virtual patient systems, often based on models like BERT (Bidirectional Encoder Representations from Transformers), struggled to generate natural dialogues and adapt to complex clinical scenarios, limiting their effectiveness in medical training [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. In contrast, modern LLMs, leveraging prompt-based techniques for role and scenario customization, demonstrate improved contextual understanding, enabling clinically relevant responses. However, these systems still face challenges with hallucination, defined as the generation of factually incorrect or contextually irrelevant content, which may compromise the accuracy of medical history-taking in virtual patient simulations [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>To ensure effective clinical training, continuous improvement and validation of LLMs are essential to mitigate hallucination rate and ensure the reliability of generated information [<xref ref-type="bibr" rid="ref7">7</xref>]. Recent studies have incorporated techniques such as Supervised Fine-Tuning (SFT) [<xref ref-type="bibr" rid="ref9">9</xref>] and Retrieval-Augmented Generation (RAG) [<xref ref-type="bibr" rid="ref10">10</xref>] to enhance contextual adaptability and diagnostic accuracy, as measured by metrics like top-k accuracy and GTPA@k. However, the specific disease types simulated by LLMs, such as neurological and rheumatological conditions or rare and multiple disease types, and potential gaps in simulation capabilities remain underexplored, partly due to variations in experimental design and datasets.</p>
      <p>Despite progress, systematic comparative evaluations of these techniques in virtual patient systems are lacking. Existing literature reviews often broadly discuss LLMs in medical education without focusing on virtual patient history-taking, limiting insights into domain-specific challenges [<xref ref-type="bibr" rid="ref11">11</xref>]. Thus, a systematic literature review is critical to consolidate fragmented research, identify challenges, and guide future work.</p>
      <p>Previous systematic reviews have explored LLMs in medical education but lack specificity. For instance, Lucas et al [<xref ref-type="bibr" rid="ref11">11</xref>] reviewed LLMs’ implications for teaching effectiveness, ethics, and reliability but did not focus on virtual patient history-taking. García-Torres et al [<xref ref-type="bibr" rid="ref12">12</xref>] used a hybrid human-LLM methodology to evaluate virtual patients’ impact on clinical reasoning but provided limited technical details on prompt design, knowledge graph (KG) integration, or fine-tuning. Similarly, Fatima et al [<xref ref-type="bibr" rid="ref13">13</xref>] conducted a cross-disciplinary review of ChatGPT in research, clinical practice, education, and patient interaction but did not systematically analyze virtual patient history-taking. Recent empirical studies on LLM-powered virtual patients with automated feedback have prioritized educational outcomes over methodological comparisons [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>In contrast, this review specifically examines LLM-based virtual patient systems for clinical history-taking. It systematically analyzes prompt engineering, external knowledge integration, model fine-tuning, and evaluation strategies, synthesizing their implications for medical education and effective clinical training. By adopting this methodological focus, this work addresses gaps in prior reviews, which were either too broad or lacked technical depth.</p>
      <p>Given this research gap, a systematic literature review is essential to consolidate fragmented research and guide future studies. This paper addresses four core research questions (RQs), as outlined in <xref rid="figure1" ref-type="fig">Figure 1</xref>:</p>
      <list list-type="bullet">
        <list-item>
          <p>RQ1: what types of patients, conditions, or diseases, such as internal medicine or rare and multiple disease types, are simulated in LLM-based virtual patient systems?</p>
        </list-item>
        <list-item>
          <p>RQ2: what techniques do LLMs use to enhance medical history-taking capabilities in clinical interviews?</p>
        </list-item>
        <list-item>
          <p>RQ3: how are experimental designs structured to evaluate LLM-based virtual patient systems, and what evaluation metrics, such as top-k accuracy or System Usability Scale (SUS), are used?</p>
        </list-item>
        <list-item>
          <p>RQ4: what public datasets are available, and what are their characteristics for training, simulating, and evaluating medical history–taking in virtual patient systems?</p>
        </list-item>
      </list>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Overview of 4 research questions on large language model–based virtual patient systems, covering simulation types, enhancement techniques, evaluation strategies, and datasets.</p>
        </caption>
        <graphic xlink:href="medinform_v14i1e79039_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>This systematic literature review addresses these questions to guide researchers in understanding the potential, limitations, and ethical challenges of LLMs in virtual patient systems. It is critical for advancing effective clinical training tools in medical education. The paper is structured as follows: the first section introduces the background of virtual patient systems and LLM applications; the second section describes the literature search methodology and evaluation criteria; the third section presents results, analyzing effectiveness and challenges; the fourth section discusses findings in the context of existing literature; and the fifth section concludes with key findings and recommendations for future research.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>This study conducted a systematic literature review adhering to PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) 2020 guidelines. The research topic and target population were clearly defined, a comprehensive search strategy was formulated, and key search terms were identified. Search results were exported as CSV files via Mendeley (Mendeley Ltd) and Zotero (Corporation for Digital Scholarship) and manually screened. Abstracts were initially evaluated, followed by a comprehensive full-text assessment to confirm relevance and quality.</p>
      </sec>
      <sec>
        <title>Eligibility Criteria</title>
        <sec>
          <title>Population</title>
          <p>The population includes medical students and physicians engaged in medical history–taking. Artificial intelligence (AI)–based virtual patients refer to LLM-based systems simulating patients for training in history-taking. Only transformer-based LLMs (eg, GPT series) are included, excluding older models like BERT (Google AI) or pre-Transformer architectures. Studies focusing on anesthesia, emergency procedures, or intraoperative scenarios are excluded, as these are nontraditional history-taking contexts. Preoperative history-taking for surgical and orthopedic patients is included. This ensures a focus on transformer-based LLMs for history-taking, emphasizing educational and clinical skill development while excluding irrelevant clinical settings. Only studies where LLMs serve as virtual patients for active history-taking and patient communication are included, excluding those focused solely on medical interviews without history-taking.</p>
        </sec>
        <sec>
          <title>Intervention</title>
          <p>The intervention involves transformer-based LLM technologies applied to medical history-taking, including prompt design, model fine-tuning, KG integration, and other LLM-related techniques. These aim to enhance diagnostic accuracy, interaction quality, and overall performance in virtual patient systems.</p>
        </sec>
        <sec>
          <title>Outcomes</title>
          <p>Outcomes include performance metrics such as top-k accuracy (eg, top-1 accuracy), empathy scores, readability, system stability (eg, response consistency), user experience (eg, SUS, Chatbot Usability Questionnaire [CUQ]), and other relevant indicators, such as <italic>κ</italic> (Cohen κ) and <italic>P</italic> value. These metrics evaluate the effectiveness of transformer-based LLM virtual patient systems in medical history–taking.</p>
        </sec>
        <sec>
          <title>Inclusion and Exclusion Criteria</title>
          <p>Inclusion and exclusion criteria were established to ensure relevance and quality, as shown in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>. Inclusion criteria prioritize studies with technical depth, validated outcomes, recent publication, and a focus on Transformer-based LLM virtual patients for history-taking, excluding nontransformer models.</p>
          <boxed-text id="box1" position="float">
            <title>Inclusion and exclusion criteria for transformer-based large language model (LLM) virtual patient studies.</title>
            <p>
              <bold>Inclusion criteria:</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>IC1: the study population included medical students or physicians involved in medical history–taking or communication training.</p>
              </list-item>
              <list-item>
                <p>IC2: the intervention involved Transformer-based LLMs (eg, GPT series) used as virtual patients for history-taking or communication training in traditional consultation settings.</p>
              </list-item>
              <list-item>
                <p>IC3: the study was published between January 1, 2020, and August 18, 2025, including peer-reviewed articles and preprints.</p>
              </list-item>
              <list-item>
                <p>IC4: the study reported measurable outcomes related to diagnostic performance, communication effectiveness, empathy, readability, or user experience.</p>
              </list-item>
              <list-item>
                <p>IC5: preoperative history-taking scenarios (eg, surgical or orthopedic patients) were included if they involved conventional patient–clinician consultation processes.</p>
              </list-item>
            </list>
            <p>
              <bold>Exclusion Criteria:</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>EC1: based on earlier or nongenerative transformer models (eg, BERT [Bidirectional Encoder Representations from Transformers] and GPT-2) rather than modern LLMs.</p>
              </list-item>
              <list-item>
                <p>EC2: focused solely on medical interviews without explicit medical history–taking.</p>
              </list-item>
              <list-item>
                <p>EC3: published before January 2020.</p>
              </list-item>
              <list-item>
                <p>EC4: duplicate titles or redundant publications.</p>
              </list-item>
              <list-item>
                <p>EC5: review or commentary papers.</p>
              </list-item>
              <list-item>
                <p>EC6: non-English language studies.</p>
              </list-item>
              <list-item>
                <p>EC7: focused on anesthesia, emergency procedures, or intraoperative contexts (except preoperative consultations).</p>
              </list-item>
            </list>
          </boxed-text>
        </sec>
      </sec>
      <sec>
        <title>Information Sources</title>
        <p>A systematic literature search was conducted across 9 authoritative databases: PubMed, Scopus, Web of Science, IEEE Xplore, ACM Digital Library, SpringerLink, ERIC, arXiv, and ACL Anthology, covering studies published between January 2020 and August 18, 2025, in medicine, AI, education, and virtual technologies. A secondary search used a snowballing strategy, identifying additional sources from 15 initial articles and relevant publications, including theses from ProQuest. Including peer-reviewed articles and grey literature (eg, preprints from arXiv and theses) ensured comprehensive coverage of the rapidly evolving fields of AI and medical education.</p>
      </sec>
      <sec>
        <title>Search Strategy</title>
        <p>Two search strategies were used for this systematic review. The first query combined virtual patient terms (eg, virtual patient, simulated patient, AI patient, conversational patient, chatbot patient, intelligent virtual agent, and dialogue agent) with LLM-related keywords (eg, LLM, ChatGPT, GPT-4, GPT, transformer model, generative AI, AI-powered tutor, and natural language generation) and was applied to structured databases including PubMed, Scopus, Web of Science, IEEE Xplore, and ACM Digital Library. The second, simplified query included only virtual patient terms and was applied to broader repositories, namely SpringerLink, ERIC, arXiv, and ACL Anthology, to ensure comprehensive coverage, as strict LLM keywords reduced results in these databases (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for more details).</p>
      </sec>
      <sec>
        <title>Selection Process and Data Collection</title>
        <p>The literature screening and data collection process followed PRISMA 2020 guidelines [<xref ref-type="bibr" rid="ref14">14</xref>] (checklist provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Titles and abstracts were manually screened for relevance to LLM-based virtual patient history-taking, with GPT-3.5 Turbo used for auxiliary verification. Full-text evaluations were conducted for studies meeting preliminary inclusion criteria. A multidimensional quality assessment form recorded key study characteristics and outcomes, as detailed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendices 3</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>. The ChatPDF tool was used to cross-check content, and final inclusion decisions were made with the supervising researcher’s input.</p>
      </sec>
      <sec>
        <title>Quality and Risk of Bias Assessment</title>
        <p>A reviewer with a computer science background, under the supervision of an experienced research advisor, conducted the quality assessment. A customized multidimensional assessment framework was developed to evaluate the technical quality of the included studies, as described in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendices 3</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>. Conventional appraisal tools such as the Joanna Briggs Institute Critical Appraisal Checklist (2020) were adapted because they were not fully suitable for LLM-based virtual patient research. The framework incorporated 6 evaluation dimensions: methodology clarity, dataset transparency, completeness of system evaluation, innovation or integration level, reproducibility and openness, and the presence of control or baseline comparisons. Each dimension was rated on a 3-point scale ranging from 0 to 2, with 2 indicating the highest quality. The total score for each study, therefore, ranged from 0 to 12. Based on the overall score, studies were categorized into 3 quality levels: high (9-12 points), medium (5-8 points), and low (0-4 points). This classification ensured a consistent and transparent interpretation of the technical quality across studies. The assessment emphasized methodological rigor, technical implementation, system architecture, model training strategies, multimodal integration, and evaluation methods to identify potential sources of bias.</p>
        <p>Risk of bias was also assessed across 5 domains—selection or reporting bias, implementation bias, evaluation bias, data bias, and reporting completeness bias—to capture variations in study design, data transparency, technical implementation, and reporting quality. This combined approach ensured a systematic evaluation of both methodological soundness and potential bias in LLM-based virtual patient studies.</p>
        <p>Based on the aforementioned screening and evaluation methods, the research results will be presented in detail in the next section.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>This section reports the results of literature selection, study characteristics, and findings related to the 4 RQs, avoiding interpretive discussion.</p>
      </sec>
      <sec>
        <title>Study Selection Results</title>
        <p>During identification (see <xref rid="figure2" ref-type="fig">Figure 2</xref> for the 2020 flow diagram for systematic reviews, including database and register searches), 848 records were retrieved from 10 databases. During abstract screening, 672 records were excluded, and after removing 46 duplicates, 130 unique full-text articles were assessed. Of these, 66 were excluded for reasons including lack of extractable data (n=3), duplicate content (n=6), non-LLM models (n=13), focus not on LLM applications (n=5), nonroutine clinical settings (n=2), no history-taking focus (n=36), or nonrelevance to LLM technologies (n=1), leaving 64 eligible articles. During multidimensional evaluation, 20 studies were excluded for limited relevance, 11 for insufficient technical detail, and 4 for other reasons, yielding 29 articles from database searches. An additional 12 articles were identified through snowballing, of which 2 were excluded as irrelevant, resulting in 10 included snowballed studies. The final synthesis comprised a total of 39 studies [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref51">51</xref>] (6 high-quality, 33 moderate-quality) (see <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>PRISMA flow diagram.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e79039_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Quality and Risk of Bias Assessment Results</title>
        <p>Most studies achieved the highest rating (2 points) for methodological clarity, indicating well-defined research procedures and consistent experimental designs. Scores for dataset transparency and system evaluation completeness varied considerably, with approximately 45%-55% of studies receiving moderate ratings (1 point) due to insufficient details regarding data sources or evaluation frameworks. Innovation and integration levels were generally high, with about 80% of studies receiving high ratings (2 points), reflecting notable progress in multimodal integration and technological creativity. Reproducibility and openness received moderate ratings (1 point) in approximately 60% of studies, as some provided replication details whereas others lacked information on model configurations or training strategies. Control or baseline comparison obtained the lowest scores (0-1 point), with only 25%-30% of studies incorporating explicit comparative or controlled analyses. Overall, 6/39 (15%) studies were rated as “high quality,” and 33/39 (85%) as “moderate quality,” suggesting strong methodological design and innovation but highlighting the need for improved dataset transparency, evaluation completeness, and reproducibility (see <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> for details).</p>
        <p>Risk of bias was assessed across 5 domains—selection and reporting bias, implementation bias, evaluation bias, data bias, and reporting completeness bias—as illustrated in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Overview of risk of bias distribution across studies.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e79039_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Selection and reporting bias was evident in disease coverage. Approximately 95% (37/39) of studies simulated specific disease types (eg, internal medicine or mental health disorders), but only 50% (20/39) addressed multiple disease stages, thereby limiting generalizability to multimorbidity contexts.</p>
        <p>Implementation bias was observed in the limited reporting of model development details. While 80% (31/39) of studies described model types, only 35% (14/39) provided information on training procedures, fine-tuning processes, prompt engineering, reinforcement learning with human feedback (RLHF), domain-specific adaptation, or multimodal integration. This resulted in high technical heterogeneity.</p>
        <p>Evaluation bias was reflected in the uneven assessment of educational and user experience outcomes. Approximately 70% (27/39) of studies evaluated educational outcomes, and 65% (25/39) assessed user experience using instruments such as the SUS or the Chatbot Usability Questionnaire (CUQ). However, only 30% (12/39) of studies conducted comparative or controlled evaluations, which limited the interpretability of effectiveness findings.</p>
        <p>Data bias stemmed from moderate dataset transparency. Fewer than half of the studies (18/39, 45%) explicitly identified data sources or quality control procedures, constraining reproducibility.</p>
        <p>Reporting completeness bias was identified in the limited documentation of personalization mechanisms and quality control for generated outputs, reported in only 25%-30% (10-12/39) of studies, suggesting incomplete methodological reporting.</p>
        <p>In summary, this multidimensional quality and bias assessment systematically examined the technical features, methodological design, and potential biases of LLM-based virtual patient research. The findings highlight the need to improve dataset transparency, enhance the comprehensiveness of system evaluation, and include controlled comparative studies to strengthen reproducibility and scientific validity. Details of the screening results and full bias assessment are provided in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p>
      </sec>
      <sec>
        <title>Finding for RQ1: What Types of Patients Are Simulated in LLM-Based Virtual Patient Systems?</title>
        <p>This section presents a descriptive statistical analysis and classification of patient types simulated by LLMs in virtual patient history-taking systems, based on a systematic literature review. The classification uses disease categories from <xref ref-type="table" rid="table1">Table 1</xref>, grouping similar disease types (eg, internal medicine and mental health disorders). For each category, the number of studies, specific simulation scenarios, disease complexity (eg, low: single symptom; medium: multisystem involvement; high: rare or complex interactions), number of simulated cases (if specified), and disease stage (eg, acute and chronic) are summarized.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Classification of patient types simulated in virtual patient systems.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="100"/>
            <col width="100"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Disease category</td>
                <td>Simulation scenarios</td>
                <td>Complexity</td>
                <td>Number of cases</td>
                <td>Stage</td>
                <td>References</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Internal medicine</td>
                <td>Chest/abdominal pain, diabetes, COPD<sup>a</sup>, COVID-19, hypertension, multisymptom</td>
                <td>Medium-high</td>
                <td>3-500+</td>
                <td>Acute-chronic</td>
                <td>12 studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Mental health disorders</td>
                <td>Depression, PTSD<sup>b</sup>, ADHD<sup>c</sup>, TRD<sup>d</sup>, CBT<sup>e</sup> models</td>
                <td>High</td>
                <td>1-106</td>
                <td>Acute-chronic</td>
                <td>8 studies [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Surgical/orthopedic</td>
                <td>Plastic surgery, hand surgery, joint pain</td>
                <td>Medium</td>
                <td>3-10</td>
                <td>Acute-chronic</td>
                <td>4 studies [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Neurological/rheumatological</td>
                <td>Stroke, meningitis, concussion, polymyositis, brain hemorrhage</td>
                <td>High</td>
                <td>1-4</td>
                <td>Acute</td>
                <td>6 studies [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Ophthalmological</td>
                <td>Eye conditions, pain with redness/photophobia</td>
                <td>Medium</td>
                <td>1-24</td>
                <td>Acute</td>
                <td>3 studies [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Dermatological</td>
                <td>Ear cyst, telogen effluvium, skin conditions</td>
                <td>Medium</td>
                <td>1-394</td>
                <td>Acute-chronic</td>
                <td>4 studies [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Rare/multiple disease types<sup>f</sup></td>
                <td>Rare diseases, unspecified conditions, broad patient scenarios</td>
                <td>High</td>
                <td>8-5230</td>
                <td>Diverse</td>
                <td>10 studies [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref51">51</xref>]</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>COPD: chronic obstructive pulmonary disease.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>PTSD: posttraumatic stress disorder.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>ADHD: attention deficit hyperactivity disorder.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>TRD: treatment-resistant depression.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>CBT: cognitive behavioral therapy.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>Only [<xref ref-type="bibr" rid="ref25">25</xref>] involves rare disease simulation, using the RareBench dataset with 421 rare disease cases. Other references in the “rare/multiple disease types” category focus on multiple disease types or unspecified/broad patient scenarios.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Summary of Research Focus and Coverage</title>
        <p>A systematic descriptive analysis of the literature indicates that research primarily focuses on internal medicine, particularly gastrointestinal (eg, abdominal pain, heartburn, hematemesis [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]), respiratory (eg, cough, chronic obstructive pulmonary disease, COVID [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]), cardiovascular (eg, chest pain, hypertension [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]), metabolic and endocrine (eg, diabetes [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]), and fatigue-related conditions (eg, chronic fatigue [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]). These conditions are well-suited for effective clinical training, with LLMs effectively simulating both acute and chronic management scenarios, though vague symptoms like fatigue remain underexplored.</p>
        <p>Mental health disorders are a significant focus, including depression and related disorders (eg, suicidal ideation and treatment-resistant depression) [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref35">35</xref>], posttraumatic stress disorder (PTSD; eg, combat trauma) [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], attention-deficit/hyperactivity disorder [<xref ref-type="bibr" rid="ref31">31</xref>], and other mental health conditions (eg, cognitive impairment, cognitive behavioral therapy [CBT] models) [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. These simulations emphasize emotional and psychological complexity, suitable for empathy and CBT training, though behavioral disorders like attention-deficit/hyperactivity disorder are less explored.</p>
        <p>Rare and multiple disease types are well represented, focusing on heterogeneous case collections and electronic health record (EHR)–driven simulations [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref51">51</xref>], as well as rare disease modeling (eg, 421 rare diseases [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]). These studies highlight the scalability and diversity of LLMs in simulating complex and varied clinical conditions.</p>
        <p>Less studied areas include neurological and rheumatological diseases (eg, stroke, meningitis, and polymyositis [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]), dermatological diseases (eg, ear cyst, telogen effluvium, and skin conditions [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]), surgical and orthopedic diseases (eg, joint pain, plastic surgery, and hand surgery [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]), and ophthalmological diseases (eg, eye pain, photophobia, and multiple eye conditions [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]). These areas, due to specialized or visual simulation requirements, have received less attention, revealing significant research gaps.</p>
        <p>Overall, research focuses on internal medicine and mental health disorders due to their prevalence and clinical relevance. Rare and multiple disease types demonstrate LLMs’ scalability and generalization potential. In contrast, specialized domains such as surgical and orthopedic, neurological and rheumatological, dermatological, and ophthalmological diseases remain underexplored, presenting opportunities for further innovation in simulation-based medical education.</p>
      </sec>
      <sec>
        <title>Finding for RQ2: What Techniques Do LLMs Use to Enhance Medical History–Taking Capabilities in Clinical Interviews?</title>
        <p>To address RQ2, LLMs use advanced techniques to enable realistic, dynamic, and accurate virtual patient simulations for medical history-taking, providing medical students and professionals with effective clinical training platforms that closely resemble real-world scenarios. The following sections categorize core LLM technologies for history-taking into prompt engineering, KGs and structured data, model fine-tuning and training, and speech interaction, as summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Technical metrics, evaluation scores, and key strengths and limitations of LLM virtual patient techniques.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="270"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Category and technique (source)<sup>a</sup></td>
                <td colspan="2">Top-k accuracy (%)</td>
                <td colspan="2">Hallucination rate (%)</td>
                <td colspan="2">IRR<sup>b</sup> (%)</td>
                <td colspan="2">AS<sup>c</sup> (%)</td>
                <td colspan="2">Strengths</td>
                <td>Limitations</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="14">
                  <bold>Prompt design</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Role-based prompts [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]</td>
                <td colspan="2">81.4</td>
                <td colspan="2">4.97</td>
                <td colspan="2">2.08</td>
                <td colspan="2">28</td>
                <td colspan="2">Realistic role-play; multiagent</td>
                <td colspan="2">Long prompts inconsistent; complex cases limited</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Few-shot [<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td colspan="2">Not used: 25%, used: 52%</td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">Simple, effective</td>
                <td colspan="2">Limited depth</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Multiagent prompt frameworks [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]</td>
                <td colspan="2">—<sup>d</sup></td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Consistent dialogue</td>
                <td colspan="2">Higher computation</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>KG<sup>e</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Entity-relation triples [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Structured triples</td>
                <td colspan="2">Small-scale realism</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Entity layering [<xref ref-type="bibr" rid="ref10">10</xref>]</td>
                <td colspan="2">97.42</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">High diagnostic accuracy and interpretability</td>
                <td colspan="2">Limited semantic layers</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>KG+history and few-shot [<xref ref-type="bibr" rid="ref10">10</xref>]</td>
                <td colspan="2">97.85</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Role continuity</td>
                <td colspan="2">Context overflow</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>KG+multiagent prompts [<xref ref-type="bibr" rid="ref10">10</xref>]</td>
                <td colspan="2">97.85</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Modular roles</td>
                <td colspan="2">High cost</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Fine-tuning</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SFT<sup>f</sup> [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td colspan="2">—</td>
                <td colspan="2">(Before fine-tuning 3.71%, after fine-tuning 0.31%)</td>
                <td colspan="2">4.79</td>
                <td colspan="2">87.00</td>
                <td colspan="2">Reduces hallucination rate; better reasoning</td>
                <td colspan="2">Limited turn validation</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LoRA<sup>g</sup> [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Efficient tuning</td>
                <td colspan="2">Limited scalability evidence</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Speech interaction</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>TTS<sup>h</sup> [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">Realistic voice; multimodal input</td>
                <td colspan="2">Recognition errors possible</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>All techniques listed are LLM-based Transformer architectures.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>IRR: information-related response rate.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>AS: Anthropomorphism Score.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>Not available.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>KG: knowledge graph.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>SFT: supervised fine-tuning.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>LoRA: low-rank adaptation.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>TTS: text-to-speech.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <sec>
          <title>Prompt Design for Realistic Patient Simulation</title>
          <list list-type="order">
            <list-item>
              <p>Role-based prompt design: detailed prompts embed patient demographics, personality traits (eg, Big Five), and clinical symptoms to simulate authentic patient behavior. For instance, Borg et al [<xref ref-type="bibr" rid="ref41">41</xref>] developed virtual patient cases with approximately 2000 tokens per prompt, including detailed medical histories and contextual information, occupying a significant portion of the LLM’s 4096-token context window. Implementations include structured prompts with role context, medical history, and behavioral constraints [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. For example, Bodonhelyi et al [<xref ref-type="bibr" rid="ref37">37</xref>] used Satir model roles (eg, “accuser”) to achieve up to 95% dialogue consistency. This ensures consistent role-playing but requires instructor oversight to maintain medical accuracy [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
            </list-item>
            <list-item>
              <p>Few-shot and dialogue history–based prompting: prompts incorporate dialogue history and few-shot examples to maintain conversational coherence and mimic gradual information disclosure. MEDDxAgent uses structured examples to guide diagnostic reasoning, improving top-k accuracy from 25% to 52% in complex cases [<xref ref-type="bibr" rid="ref25">25</xref>]. Limitations include token constraints, which may restrict handling of intricate scenarios.</p>
            </list-item>
            <list-item>
              <p>Multiagent prompt frameworks: multiple agents (eg, patient agent, doctor agent, and behavior controller) collaborate to generate realistic and personalized clinical dialogues, reducing reliance on single prompts and minimizing hallucination rate. For example, EvoPatient, developed by Du et al [<xref ref-type="bibr" rid="ref18">18</xref>], uses a patient–doctor dual-agent system for natural dialogue via unsupervised learning. LLM-based generative agents integrate memory flows, retrievers, and cognitive mechanisms to enhance dialogue realism and training effectiveness [<xref ref-type="bibr" rid="ref24">24</xref>]. Additionally, AI self-play agents such as AMIE simulate diagnostic conversations through internal and external self-play loops, applying multirole strategies within a single model to improve diversity and dialogue adaptation [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
            </list-item>
          </list>
        </sec>
        <sec>
          <title>Knowledge Graphs and Structured Data</title>
          <list list-type="order">
            <list-item>
              <p>Dynamic KG retrieval: MedDiT uses KG agents to retrieve relevant subgraphs via SPARQL queries, linearizing them into natural language prompts to reduce token load and context loss, achieving significant token reduction [<xref ref-type="bibr" rid="ref47">47</xref>].</p>
            </list-item>
            <list-item>
              <p>Hierarchical KG modeling: AI Patient integrates layered KG with a Reasoning RAG multiagent framework, boosting accuracy from 68.94% to 94.15%. For the difficult “family and social history” category, accuracy improved from 13.33% to 85.56%, showing the value of structured KG reasoning. Compared to role-based prompts, the KG-based method performs better in multiclass tasks. With entity layering [<xref ref-type="bibr" rid="ref10">10</xref>], accuracy rose by 16.02% (81.4% to 97.42%) in challenging categories such as allergy and social history.</p>
            </list-item>
          </list>
        </sec>
        <sec>
          <title>Model Fine-Tuning and Training</title>
          <list list-type="order">
            <list-item>
              <p>Instruction fine-tuning and self-play: AMIE uses real and simulated medical dialogues with self-play loops (inner and outer) to optimize diagnostic performance, outperforming primary care physicians (PCPs) [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p>
            </list-item>
            <list-item>
              <p>SFT and LoRA (low-rank adaptation): Liu et al [<xref ref-type="bibr" rid="ref17">17</xref>] fine-tuned Qwen2.5-72B-Instruct on MedDialog, reducing the hallucination rate from 3.71% to 0.31% [<xref ref-type="bibr" rid="ref17">17</xref>]. At the same time, the fine-tuned model achieved an information-related response rate of 4.79% and an Anthropomorphism Score (AS) of 87.00% (see <xref ref-type="table" rid="table2">Table 2</xref>).</p>
            </list-item>
            <list-item>
              <p>Chain-of-thought (CoT) and RLAIF+MoM: CureFun uses CoT and RAG, while Kumar et al’s [<xref ref-type="bibr" rid="ref26">26</xref>] RLAIF+MoM framework structures ambiguous symptoms, achieving up to 95% output clarity [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
            </list-item>
          </list>
        </sec>
        <sec>
          <title>Speech Interaction</title>
          <list list-type="order">
            <list-item>
              <p>TTS and STT integration: Takata et al [<xref ref-type="bibr" rid="ref28">28</xref>] use Google application programming interface (API) and Unity for synchronized speech [<xref ref-type="bibr" rid="ref51">51</xref>] and emotional facial expressions, outperforming traditional platforms in interaction realism. Thesen et al [<xref ref-type="bibr" rid="ref51">51</xref>] leverage Whisper-3 for immersive speech processing, achieving high student satisfaction.</p>
            </list-item>
            <list-item>
              <p>Multimodal enhancements: AMIE integrates image uploads (eg, skin photos) to support history-taking, outperforming PCPs in diagnostic accuracy [<xref ref-type="bibr" rid="ref19">19</xref>]. Ryu et al [<xref ref-type="bibr" rid="ref35">35</xref>] enhance realism with patient image uploads.</p>
            </list-item>
          </list>
          <p>In conclusion, LLM technologies achieve consistent and realistic virtual patient dialogues through multilevel prompt design, few-shot examples, and multiagent mechanisms. The use of KGs and structured data enhances information retrieval and medical history modeling, improving classification and recognition accuracy. Model fine-tuning techniques (SFT, LoRA, self-play, CoT, RLAIF+MoM) effectively reduce hallucination rate, optimize reasoning, and enhance diagnostic performance. Furthermore, speech interaction and multimodal integration improve interaction immersion and provide diagnostic support.</p>
        </sec>
      </sec>
      <sec>
        <title>Findings for RQ3: How Are Experimental Designs Structured to Evaluate LLM-Based Virtual Patient Systems, and What Evaluation Metrics Are Used?</title>
        <p>To address RQ3, this review synthesized 32 studies listed in <xref ref-type="table" rid="table3">Table 3</xref> to examine the experimental design, evaluation methods, and metrics of LLM-based virtual patient systems, focusing on medical education, high diagnostic accuracy, and effective clinical training. Of the 39 identified studies (see <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>), 7 were excluded due to the absence of formal evaluations (eg, Takata et al [<xref ref-type="bibr" rid="ref28">28</xref>] described planned behavioral experiments without data collection; Rodrigo et al [<xref ref-type="bibr" rid="ref46">46</xref>] outlined testing plans but reported no results; Geer [<xref ref-type="bibr" rid="ref31">31</xref>] focused on design without providing quantitative outcomes; Li et al [<xref ref-type="bibr" rid="ref47">47</xref>] emphasized system architecture but lacked performance metrics; Staples et al [<xref ref-type="bibr" rid="ref34">34</xref>] relied solely on qualitative feedback; Kumar et al [<xref ref-type="bibr" rid="ref26">26</xref>] discussed workflow but lacked quantified results; Lee et al [<xref ref-type="bibr" rid="ref15">15</xref>] provided expert Likert scores without statistical analysis). Due to high heterogeneity in evaluation metrics (eg, top-k accuracy, hallucination rate, CUQ, inconsistent scales and dimensions, lack of statistical information such as SDs or confidence intervals in many studies, and large variations in sample sizes that could bias pooled analyses), a meta-analysis was not performed. Instead, a structured narrative synthesis was adopted, supplemented with tabular summaries in <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>, which present the technical approaches and corresponding evaluation metrics.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Merged quantitative and qualitative evaluation details for virtual patient systems.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <col width="150"/>
            <col width="150"/>
            <col width="110"/>
            <col width="110"/>
            <col width="160"/>
            <col width="200"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Authors</td>
                <td>Participants (N)</td>
                <td>Identity</td>
                <td>Comparisons</td>
                <td>Analysis method</td>
                <td>Results</td>
                <td>Evaluation type</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Du et al [<xref ref-type="bibr" rid="ref18">18</xref>]</td>
                <td>None</td>
                <td>AI<sup>a</sup></td>
                <td>Evolved vs Nonevolved</td>
                <td>Stats<sup>b</sup></td>
                <td>Relevance 0.7589; Faithfulness 0.8786</td>
                <td>Q<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>Holderried et al [<xref ref-type="bibr" rid="ref8">8</xref>]</td>
                <td>106</td>
                <td>Med students</td>
                <td>GPT-4 vs Human</td>
                <td>
                  <italic>κ</italic>
                  <sup>d</sup>
                </td>
                <td>Top-k accuracy &#62;99%; <italic>κ</italic>=0.832</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Tu et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td>
                <td>20</td>
                <td>PCPs<sup>e</sup></td>
                <td>AMIE vs PCPs</td>
                <td>Stats</td>
                <td>AMIE better on 28/32 expert metrics</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Yamamoto et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td>
                <td>145</td>
                <td>Med students</td>
                <td>AI vs Traditional</td>
                <td>t and MWU<sup>f</sup>; Likert<sup>g</sup></td>
                <td>OSCE: 28.1 vs 27.1 (<italic>P</italic><sup>h</sup>=.01)</td>
                <td>M<sup>i</sup></td>
              </tr>
              <tr valign="top">
                <td>Cook et al [<xref ref-type="bibr" rid="ref50">50</xref>]</td>
                <td>3</td>
                <td>Physicians</td>
                <td>GPT-4.0 vs 3.5</td>
                <td>1–6 scale, multivar.</td>
                <td>GPT-4.0 higher authenticity, feedback</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Haider et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td>
                <td>None</td>
                <td>AI</td>
                <td>GPT-4o vs Claude vs Gemini</td>
                <td>Stats</td>
                <td>Nonsignificant differences; high baseline</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Brugge et al [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
                <td>21</td>
                <td>Med students</td>
                <td>Feedback vs Control</td>
                <td>Wilcoxon, ICC<sup>j</sup>=0.924</td>
                <td>CRI-HTI: 3.60 vs 3.02 (<italic>P</italic><sup>k</sup>≈.05)</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Holderried et al [<xref ref-type="bibr" rid="ref22">22</xref>]</td>
                <td>28</td>
                <td>Med students</td>
                <td>None</td>
                <td>Stats; Spearman</td>
                <td>Script Q 60.3%; Answers 94.4%</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Leypold et al [<xref ref-type="bibr" rid="ref38">38</xref>]</td>
                <td>3</td>
                <td>Hand surgeons</td>
                <td>None</td>
                <td>Likert (1–5)</td>
                <td>Mean=4.6 (Understand 5.0; History 4.2)</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Borg et al [<xref ref-type="bibr" rid="ref40">40</xref>]</td>
                <td>15</td>
                <td>Med students</td>
                <td>Robot vs VP</td>
                <td><italic>t</italic> test; Thematic</td>
                <td>Authenticity 4.5 vs 3.9 (<italic>P</italic><sup>l</sup>=.04)</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Radel et al [<xref ref-type="bibr" rid="ref36">36</xref>]</td>
                <td>40</td>
                <td>Med students</td>
                <td>Feedback vs None</td>
                <td><italic>t</italic> test; Likert</td>
                <td>Improved scores with feedback (<italic>P</italic><sup>m</sup>&#60;.05)</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Luo et al [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
                <td>184</td>
                <td>Med students</td>
                <td>LLMDP vs Traditional</td>
                <td><italic>t</italic> test; Pearson</td>
                <td>78.13 vs 67.08 (<italic>P</italic><sup>n</sup>&#60;.001)</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Thesen et al [<xref ref-type="bibr" rid="ref51">51</xref>]</td>
                <td>94</td>
                <td>Med students</td>
                <td>None</td>
                <td>Likert; <italic>t</italic> test</td>
                <td>Comfort 61% to 76% (<italic>P</italic><sup>o</sup>&#60;.001)</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Laverde et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td>
                <td>86</td>
                <td>Med students</td>
                <td>Agent vs Others</td>
                <td>CUQ<sup>p</sup></td>
                <td>CUQ: 86.25/100</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Benfatah et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td>
                <td>12</td>
                <td>Nursing students</td>
                <td>None</td>
                <td>Pearson; Likert</td>
                <td>Total score 19.42; Clarity r=0.701</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Geer [<xref ref-type="bibr" rid="ref31">31</xref>]</td>
                <td>None</td>
                <td>AI</td>
                <td>None</td>
                <td>Descr<sup>q</sup></td>
                <td>High similarity; no quant data</td>
                <td>QL<sup>r</sup></td>
              </tr>
              <tr valign="top">
                <td>Borg et al [<xref ref-type="bibr" rid="ref41">41</xref>]</td>
                <td>15</td>
                <td>Med students</td>
                <td>Robot vs VIC</td>
                <td>Wilcoxon; Text</td>
                <td>Authenticity 4.47 vs 3.93 (<italic>P</italic><sup>s</sup>≈.03)</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Ng et al [<xref ref-type="bibr" rid="ref49">49</xref>]</td>
                <td>100</td>
                <td>Med students</td>
                <td>Hybrid vs Baselines</td>
                <td>Acc<sup>t</sup>; failure and confusion</td>
                <td>Top-k accuracy 98.7%; failure 2.0%</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Wang et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>N/S</td>
                <td>Experts</td>
                <td>Expert vs GPT-4</td>
                <td>Stats; Subjective</td>
                <td>Experts rated higher, GPT-4 underestimated</td>
                <td>QL</td>
              </tr>
              <tr valign="top">
                <td>Zheng et al [<xref ref-type="bibr" rid="ref48">48</xref>]</td>
                <td>N/S</td>
                <td>Experts</td>
                <td>None</td>
                <td>Weighted <italic>F</italic><sub>1</sub>-score; Fuzzy labels</td>
                <td>High professionalism and ethics</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Rose et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td>None</td>
                <td>AI</td>
                <td>GPT-4o vs Llama3.1</td>
                <td>Stats</td>
                <td>GTPA@1 0.96; RareBench 0.45</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Liu et al [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td>None</td>
                <td>AI</td>
                <td>Proposed vs Baselines</td>
                <td>Stats</td>
                <td>Hallucination Rate 0.31%; Anthropomorphism 0.87</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Chen et al [<xref ref-type="bibr" rid="ref33">33</xref>]</td>
                <td>25</td>
                <td>Patients and Psychiatrists</td>
                <td>Prompt D1–D4</td>
                <td>Stats</td>
                <td>Fluency 3.28; Empathy 3.43; Dx 55.56%</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Liao et al [<xref ref-type="bibr" rid="ref43">43</xref>]</td>
                <td>N/S</td>
                <td>Students, laypeople</td>
                <td>GPT-4 vs others</td>
                <td>Stats</td>
                <td>Dx 53.33%; Coverage rate: 15.36%-33.89%</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Johri et al [<xref ref-type="bibr" rid="ref44">44</xref>]</td>
                <td>None</td>
                <td>AI</td>
                <td>GPT-4 vs 3.5</td>
                <td>Stats</td>
                <td>MCQ 0.919; FRQ 0.684</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Bodonhelyi et al [<xref ref-type="bibr" rid="ref37">37</xref>]</td>
                <td>N/S</td>
                <td>Psych experts</td>
                <td>Accuser vs Rationalizer</td>
                <td>Likert; Emotion; Stats</td>
                <td>Realism 3.8 vs 3.7</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Rashidian et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
                <td>2</td>
                <td>Clinicians</td>
                <td>AI vs Doctors</td>
                <td><italic>κ</italic>; Likert</td>
                <td>Symptom top-k accuracy 97.7%; <italic>κ</italic>=0.74</td>
                <td>M</td>
              </tr>
              <tr valign="top">
                <td>Tu et al [<xref ref-type="bibr" rid="ref45">45</xref>]</td>
                <td>20</td>
                <td>PCPs</td>
                <td>AMIE vs PCPs</td>
                <td>Stats</td>
                <td>AMIE better on 28/32 expert metrics</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Saab et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td>
                <td>43</td>
                <td>Patients, specialists</td>
                <td>AMIE vs PCPs</td>
                <td>Stats; <italic>P</italic> values</td>
                <td>Top-1 accuracy: 0.65 vs 0.53 (<italic>P</italic><sup>u</sup>&#60;.001)</td>
                <td>Q</td>
              </tr>
              <tr valign="top">
                <td>Li et al [<xref ref-type="bibr" rid="ref23">23</xref>]</td>
                <td>8</td>
                <td>Med experts</td>
                <td>Auto vs Manual</td>
                <td>Spearman; Pearson</td>
                <td><italic>ρ</italic>=0.81; <italic>r</italic>=0.85 (<italic>P</italic><sup>v</sup>&#60;.05)</td>
                <td>Q</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>AI: artificial intelligence.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Stats: statistical analysis.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>Q: Quantitative.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup><italic>κ</italic>: Cohen κ.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>PCP: primary care physician.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>t and MWU: <italic>t</italic> test/Mann-Whitney <italic>U</italic> test.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>Likert: Likert scale.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup><italic>P</italic>: Objective Structured Clinical Examination performance comparison between AI-trained and traditionally trained students showed a difference of 28.1 vs 27.1, respectively (Mann–Whitney <italic>U</italic> test, <italic>P</italic>=.01).</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>M: Mixed.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>ICC: intraclass correlation coefficient.</p>
            </fn>
            <fn id="table3fn11">
              <p><sup>k</sup><italic>P</italic>: Feedback group scored 3.60 on CRI-HTI vs 3.02 in control group (ANOVA, ICC=0.924, <italic>P</italic>=.049).</p>
            </fn>
            <fn id="table3fn12">
              <p><sup>l</sup><italic>P</italic>: Robot vs virtual patient authenticity ratings were 4.5 vs 3.9 (<italic>t</italic> test; <italic>P</italic>=.04).</p>
            </fn>
            <fn id="table3fn13">
              <p><sup>m</sup><italic>P</italic>: Students receiving feedback showed higher Likert-scale scores compared to control (<italic>t</italic> test; <italic>P</italic>=.04).</p>
            </fn>
            <fn id="table3fn14">
              <p><sup>n</sup><italic>P</italic>: LLMDP-trained students scored 78.13 (SD 8.35) on history acquisition; traditional group scored 67.08 (SD 7.21), with a mean difference of 11.05 points (<italic>P</italic>&#60;.001).</p>
            </fn>
            <fn id="table3fn15">
              <p><sup>o</sup><italic>P</italic>: Comfort scores improved from 61% to 76% after intervention (<italic>t</italic> test; <italic>P</italic>&#60;.001).</p>
            </fn>
            <fn id="table3fn16">
              <p><sup>p</sup>CUQ: Chatbot Usability Questionnaire.</p>
            </fn>
            <fn id="table3fn17">
              <p><sup>q</sup>Descr: Descriptive.</p>
            </fn>
            <fn id="table3fn18">
              <p><sup>r</sup>QL: Qualitative.</p>
            </fn>
            <fn id="table3fn19">
              <p><sup>s</sup><italic>P</italic>: Robot vs VIC comparison yielded authenticity ratings of 4.47 vs 3.93 (Wilcoxon test; <italic>P</italic>=.035).</p>
            </fn>
            <fn id="table3fn20">
              <p><sup>t</sup>Acc: Accuracy.</p>
            </fn>
            <fn id="table3fn21">
              <p><sup>u</sup><italic>P</italic>: Original article did not provide comparative statistical values such as means or test statistics, only reported model accuracy with significance levels.</p>
            </fn>
            <fn id="table3fn22">
              <p><sup>v</sup><italic>P</italic>: Among the 8 evaluated cases, exact <italic>P</italic> values for comparisons 1 and 2 were <italic>P</italic>&#60;.001 and <italic>P</italic>=.04, respectively; the remaining 6 ranged from <italic>P</italic>=.003 to <italic>P</italic>=.011. As per reporting guidelines, <italic>P</italic>=.000 was converted to <italic>P</italic>&#60;.001.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Commonalities and Specificities in Experimental Design</title>
        <sec>
          <title>Commonalities</title>
          <p>Most studies involved medical or health professional students, typically 10-50 participants, with some including residents or practicing physicians as evaluators (usually 3-5 experts) [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. A few studies used AI agents for large-scale automated testing [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Tasks primarily covered core clinical competencies, including history-taking accuracy [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], diagnostic reasoning [<xref ref-type="bibr" rid="ref45">45</xref>], role-playing [<xref ref-type="bibr" rid="ref37">37</xref>], and multiturn dialogues [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
          <p>Evaluation paradigms were categorized into 3 types: quantitative (18 studies) using randomized controlled trials (RCTs) or comparative experiments with statistical tests (eg, ANOVA, <italic>t</italic> tests, and correlation) [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]; qualitative (1 study) using expert interviews, thematic analysis, or questionnaires [<xref ref-type="bibr" rid="ref32">32</xref>]; and mixed (13 studies) integrating objective metrics (eg, top-k accuracy and <italic>F</italic><sub>1</sub>-score) with subjective scales (eg, Likert, CUQ) [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Comparative baselines included model comparisons (GPT-4 and 4o, Claude, and Gemini) [<xref ref-type="bibr" rid="ref16">16</xref>], AI versus human physicians [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref45">45</xref>], with or without feedback or platform comparisons [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>], and variations in prompt strategies, multimodality, or RAG and KG [<xref ref-type="bibr" rid="ref49">49</xref>]. Most studies applied blinding and reported rater consistency (eg, <italic>κ</italic>) [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Details of each study’s evaluation type are provided in <xref ref-type="table" rid="table3">Table 3</xref> (Evaluation type column).</p>
        </sec>
        <sec>
          <title>Specificities</title>
          <p>Some studies implemented AI self-play for scalability [<xref ref-type="bibr" rid="ref18">18</xref>], or AI doctor-AI patient automated evaluation [<xref ref-type="bibr" rid="ref16">16</xref>]. Social robot-LLM hybrids were used to enhance realism [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. Comparative baselines varied, including direct AI versus human physician comparisons [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref45">45</xref>], modular and hybrid architecture comparisons (eg, RASA or KG or LLM) [<xref ref-type="bibr" rid="ref49">49</xref>], and multimodel comparisons (GPT-4.5 and 4o, Claude 3.7, and Gemini 2.5) showing nonsignificant differences, indicating a high performance baseline [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Metrics: Commonalities and Specificities</title>
        <p>Evaluation metrics in LLM-based virtual patient systems generally fall into 5 categories: clinical accuracy and knowledge, communication and interaction quality, robustness and stability, training efficacy and feedback quality, and system performance. Clinical accuracy and knowledge were assessed using metrics such as top-k accuracy [<xref ref-type="bibr" rid="ref19">19</xref>], GTPA@k [<xref ref-type="bibr" rid="ref25">25</xref>], information coverage [<xref ref-type="bibr" rid="ref43">43</xref>], hallucination rate [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], and fidelity and relevance [<xref ref-type="bibr" rid="ref18">18</xref>], evaluating diagnostic and information-gathering capabilities. Communication and interaction quality were measured through readability (Flesch and Flesch–Kincaid) [<xref ref-type="bibr" rid="ref10">10</xref>], CUQ [<xref ref-type="bibr" rid="ref22">22</xref>], and Anthropomorphism Score [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], reflecting language clarity and interaction naturalness. Robustness and stability were evaluated via paraphrase robustness [<xref ref-type="bibr" rid="ref10">10</xref>], leak resistance [<xref ref-type="bibr" rid="ref18">18</xref>], and rater consistency metrics, including <italic>κ</italic> [<xref ref-type="bibr" rid="ref8">8</xref>] and intraclass correlation coefficient (ICC) [<xref ref-type="bibr" rid="ref39">39</xref>]. Training efficacy and feedback quality focused on learning outcomes and user experience, assessed through Objective Structured Clinical Examination (OSCE) score improvements [<xref ref-type="bibr" rid="ref39">39</xref>], and usability scales (such as CUQ). System performance metrics, including latency, failure rate [<xref ref-type="bibr" rid="ref49">49</xref>], and confusion or clarification rates [<xref ref-type="bibr" rid="ref49">49</xref>], captured efficiency and reliability. 
Specificities included unique metrics such as span-level <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref10">10</xref>] for knowledge extraction, cosine similarity-based fidelity and relevance [<xref ref-type="bibr" rid="ref18">18</xref>], weighted <italic>F</italic><sub>1</sub>-score or fuzzy labels for professionalism and ethics [<xref ref-type="bibr" rid="ref48">48</xref>], and system-level confusion, clarification, or failure rates. Automated evaluation metrics such as GTPA@k [<xref ref-type="bibr" rid="ref25">25</xref>] provided standardized measures for diagnosis, while expert ratings revealed realism biases, including underestimation by GPT-4. Usability was further examined through instruments such as the CUQ (score of 77/100), 7-dimensional Likert scales combined with ANOVA for accuracy, realism, and empathy, and system latency and confusion reporting (0.5%). These metrics collectively enable comprehensive assessment of both the technical performance and experiential quality of LLM-based virtual patient systems. The calculation formula for the indicators is detailed in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>.</p>
      </sec>
      <sec>
        <title>Evaluation Results</title>
        <p>The synthesis of evaluation results from the 32 included studies indicates that LLM-based virtual patient systems demonstrate high diagnostic accuracy across key metrics. Quantitative results showed top-k accuracy ranging from 0.45 to 0.98 (based on 34 models and 300 patient cases); Saab et al [<xref ref-type="bibr" rid="ref19">19</xref>] reported that AMIE achieved a Top-1 Accuracy of 0.65, compared to 0.53 for PCPs, with <italic>P</italic>&#60;.001. Information coverage averaged 33.89% to 94.4% (eg, Liao et al [<xref ref-type="bibr" rid="ref43">43</xref>] reported 33.89%, based on 150 cases; Holderried et al [<xref ref-type="bibr" rid="ref22">22</xref>] reported 94.4%, based on 502 questions explicitly covered in case scripts, with a total of 826 answers), while hallucination rate remained low, ranging from 0.31% to 5% (eg, Liu et al [<xref ref-type="bibr" rid="ref17">17</xref>] conducted experiments with up to 5 dialogue rounds, achieving a hallucination rate of 0.31%).</p>
        <p>In terms of communication quality, CUQ scores ranging from 77 to 86.25 (eg, Laverde et al [<xref ref-type="bibr" rid="ref24">24</xref>] reported 86.25). Training effectiveness was evident in OSCE performance (eg, Yamamoto et al [<xref ref-type="bibr" rid="ref20">20</xref>] reported posttraining 28.1 vs pretraining 27.1, <italic>P</italic>=.01; Luo et al [<xref ref-type="bibr" rid="ref42">42</xref>] reported that medical students trained using the LLM-based digital patient system (LLMDP) achieved a medical history acquisition score of 78.13 (SD 8.35), while the control group trained with traditional real patients scored 67.08 (SD 7.21). The difference between groups was 11.05 points, with <italic>P</italic>&#60;.001, indicating a highly statistically significant difference). Robustness metrics such as <italic>κ</italic> ranged from 0.74 to 0.832 (eg, Holderried et al [<xref ref-type="bibr" rid="ref8">8</xref>] reported 0.832; Rashidian et al [<xref ref-type="bibr" rid="ref30">30</xref>] reported 0.74).</p>
        <p>Mixed methods studies reported improvements in authenticity (eg, Borg et al [<xref ref-type="bibr" rid="ref40">40</xref>] reported 4.5 for the robot vs 3.9 for the virtual patient; <italic>P</italic>=.04) and comfort (Thesen et al [<xref ref-type="bibr" rid="ref51">51</xref>] reported that among 69 participants, average preuse comfort was 61%, increasing to 76% postuse, indicating higher student-reported comfort after training; <italic>P</italic>&#60;.001). Automated baseline systems, such as EvoPatient, achieved a relevance of 0.7589 and a faithfulness of 0.8786 (Du et al [<xref ref-type="bibr" rid="ref18">18</xref>]), while multimodel comparisons showed no significant differences among top LLMs (Haider et al [<xref ref-type="bibr" rid="ref16">16</xref>]).</p>
        <p>These results highlight the effectiveness of LLM-based systems in simulating realistic interactions, although specific metrics—such as the low failure rate of 2.0% reported by Ng et al [<xref ref-type="bibr" rid="ref49">49</xref>] (based on 200 dialogue trials) and the high ICC of 0.924 reported by Brugge et al [<xref ref-type="bibr" rid="ref39">39</xref>]—indicate that further standardization is needed.</p>
        <p>The reviewed studies exhibit consistent commonalities in participant composition, task objectives, and evaluation paradigms, typically involving small to medium student samples and multisource evaluations with experts and AI, focusing on history-taking and diagnostic reasoning. Specificities are evident in comparative settings (AI vs humans, multimodel comparisons) and statistical methods such as bootstrap, false discovery rate (FDR), and ICC. Across studies, 5 key evaluation aspects were assessed using objective metrics. Clinical accuracy and knowledge were measured using top-k accuracy [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], GTPA@k [<xref ref-type="bibr" rid="ref25">25</xref>], information coverage [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], and span-level <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref18">18</xref>], capturing high diagnostic accuracy and knowledge acquisition. Authenticity was evaluated using hallucination rate [<xref ref-type="bibr" rid="ref17">17</xref>] and fidelity metrics [<xref ref-type="bibr" rid="ref18">18</xref>]. Interaction experience was assessed via CUQ [<xref ref-type="bibr" rid="ref24">24</xref>] and Anthropomorphism Score [<xref ref-type="bibr" rid="ref17">17</xref>], reflecting communication quality and user perception. Robustness and consistency were quantified using <italic>κ</italic> [<xref ref-type="bibr" rid="ref30">30</xref>] and ICC [<xref ref-type="bibr" rid="ref39">39</xref>]. System performance was captured through latency, failure rate, and confusion rate [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. These categories collectively enable a comprehensive assessment of both technical and experiential quality, highlighting the systems’ effectiveness while indicating areas for further standardization and evaluation refinement.</p>
      </sec>
      <sec>
        <title>Finding for RQ4: What Public Datasets Are Available, and What Are Their Characteristics for Training, Simulating, and Evaluating Medical History-Taking in Virtual Patient Systems?</title>
        <p>A variety of datasets support the development and evaluation of virtual patient systems, ranging from real-world EHRs to structured synthetic clinical scenarios. <xref ref-type="table" rid="table4">Table 4</xref> summarizes key datasets used in recent studies, which are either publicly available or accessible through formal application processes. These datasets provide diverse clinical data resources for effective clinical training and evaluation.</p>
        <p>MIMIC-III (Medical Information Mart for Intensive Care-III) is a comprehensive publicly available medical dataset, containing detailed EHR data from over 40,000 intensive care unit (ICU) patients [<xref ref-type="bibr" rid="ref10">10</xref>]. It covers multiple disease types, including internal medicine and neurological and rheumatological conditions, serving as a critical resource for virtual patient system development. However, its ICU-focused nature, primarily reflecting acute or severe diseases, limits its applicability to noncritical care scenarios, such as outpatient consultations, mental health disorders, or chronic disease management [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
        <p>DDxPlus provides clinical dialogue data for respiratory diseases [<xref ref-type="bibr" rid="ref25">25</xref>], suitable for training virtual patient systems in specific domains within internal medicine. The iCraft-MD dataset focuses on dermatological cases [<xref ref-type="bibr" rid="ref25">25</xref>], and RareBench contains region-specific subsets of rare and multiple disease types [<xref ref-type="bibr" rid="ref25">25</xref>]. While valuable for specialized applications, their scope is limited for building generalized, multitask virtual patient systems.</p>
        <p>The medical-NLP corpus provides a broad range of clinical dialogues and records [<xref ref-type="bibr" rid="ref18">18</xref>], while CCKS 2019 offers a Chinese medical KG dataset, enabling multilingual history-taking simulations [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. The Open-i dataset includes multimodal chest X-ray images and textual descriptions, supporting dynamic image generation alongside history-taking [<xref ref-type="bibr" rid="ref47">47</xref>]. MedQA provides multiple-choice and long-form medical question-answering data, useful for fine-tuning LLMs for diagnostic dialogues [<xref ref-type="bibr" rid="ref45">45</xref>]. These datasets enhance multimodal and question-answering capabilities, though their coverage is narrower than MIMIC-III.</p>
        <p>In summary, current datasets primarily focus on critical care or specific medical domains, with limited publicly available, diverse resources. These datasets have limitations in generalizability, multitask applicability, and multilingual support for virtual patient systems. Combining MIMIC-III (broad coverage) [<xref ref-type="bibr" rid="ref10">10</xref>], DDxPlus and iCraft-MD (specialized domains) [<xref ref-type="bibr" rid="ref25">25</xref>], and CCKS 2019 (multilingual support) [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref56">56</xref>] provides partial coverage. However, further development of diverse, emotionally annotated, and non-English datasets is needed to enhance system generalizability and conversational fidelity.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Datasets used in virtual patient systems.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <col width="150"/>
            <col width="100"/>
            <col width="400"/>
            <col width="350"/>
            <thead>
              <tr valign="top">
                <td>Dataset Name</td>
                <td>Index</td>
                <td>Description</td>
                <td>Access requirements</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MIMIC-III<sup>a</sup></td>
                <td>[<xref ref-type="bibr" rid="ref10">10</xref>]</td>
                <td>EHR<sup>b</sup> database from Beth Israel Deaconess Medical Center, over 40,000 ICU<sup>c</sup> patients (2001-2012), including vital signs, medications, laboratory results, diagnostic codes.</td>
                <td>Access via PhysioNet; requires CITI training and DUA [<xref ref-type="bibr" rid="ref52">52</xref>].</td>
              </tr>
              <tr valign="top">
                <td>DDxPlus</td>
                <td>[<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td>Synthetic dataset for respiratory diseases, with clinical dialogues and diagnoses.</td>
                <td>CC-BY<sup>d</sup>; publicly available, allows commercial use with attribution [<xref ref-type="bibr" rid="ref53">53</xref>].</td>
              </tr>
              <tr valign="top">
                <td>iCraft-MD</td>
                <td>[<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td>Synthetic dermatology dataset from public medical question banks and expert cases.</td>
                <td>MIT License; publicly available [<xref ref-type="bibr" rid="ref54">54</xref>].</td>
              </tr>
              <tr valign="top">
                <td>RareBench</td>
                <td>[<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td>Rare disease dataset with regional subsets (Europe, Canada, and China).</td>
                <td>Apache-2.0; publicly available [<xref ref-type="bibr" rid="ref55">55</xref>].</td>
              </tr>
              <tr valign="top">
                <td>medical-nlp</td>
                <td>[<xref ref-type="bibr" rid="ref18">18</xref>]</td>
                <td>Medical NLP<sup>e</sup> corpus with clinical dialogues and records.</td>
                <td>GNU General Public License v3.0; publicly available on GitHub [<xref ref-type="bibr" rid="ref18">18</xref>].</td>
              </tr>
              <tr valign="top">
                <td>CCKS 2019</td>
                <td>[<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td>Chinese medical knowledge graph dataset with entity recognition, relation extraction, QA<sup>f</sup> tasks.</td>
                <td>Research use only; cite source [<xref ref-type="bibr" rid="ref56">56</xref>].</td>
              </tr>
              <tr valign="top">
                <td>Open-i</td>
                <td>[<xref ref-type="bibr" rid="ref47">47</xref>]</td>
                <td>Multimodal dataset with 3314 chest x-ray images and textual descriptions.</td>
                <td>Open Data Commons Open Database License; publicly available via NIH Open-i project [<xref ref-type="bibr" rid="ref57">57</xref>].</td>
              </tr>
              <tr valign="top">
                <td>MedQA</td>
                <td>[<xref ref-type="bibr" rid="ref45">45</xref>]</td>
                <td>Multiple-choice and long-form medical QA dataset for diagnostics.</td>
                <td>Publicly available via GitHub [<xref ref-type="bibr" rid="ref45">45</xref>].</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>MIMIC-III: Medical Information Mart for Intensive Care-III.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>EHR: electronic health record.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>ICU: intensive care unit.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>CC-BY: Creative Commons Attribution.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>NLP: natural language processing.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>QA: question-answering.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Current LLM-based virtual patient systems exhibit significant limitations in disease coverage, complex case simulation, multimorbidity representation, specialty applicability, multimodal capabilities, and standardization of evaluation metrics, indicating a need for systematic optimization to enhance clinical fidelity, educational adaptability, and interaction quality.</p>
      </sec>
      <sec>
        <title>Limitations and Future Directions in Virtual Patient Type Simulation Research</title>
        <p>The systematic review reveals several limitations in current virtual patient systems regarding disease coverage, case complexity modeling, and support across medical specialties [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. First, research is heavily concentrated on internal medicine (eg, gastrointestinal, respiratory, and metabolic disorders) and mental health disorders, accounting for over half of the studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. This focus reflects the strengths of LLMs in language-driven tasks and the availability of relevant data, but it highlights significant gaps in surgical and orthopedic, ophthalmological, and dermatological domains. Scenarios requiring procedural operations, image recognition, or multimodal interactions are underrepresented, with current systems lacking effective modeling mechanisms [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. This finding aligns with RQ1, indicating that most simulations focus on specific disease categories, leaving certain specialty areas insufficiently covered.</p>
        <p>Although some studies involve multiple disease types, current virtual patient systems mainly use single-disease trajectories, with each virtual patient representing only one disease [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Multimorbidity simulations—patients with 2 or more coexisting chronic or acute diseases—are limited. Clinical multimorbidity requires complex decisions, including drug interactions, overlapping symptoms, and conflicting management priorities. Dataset analysis for RQ4 shows most datasets focus on a single primary disease, lacking comprehensive multimorbidity cases. Without simulating this complexity, virtual patient systems have limited use in advanced medical education and clinical reasoning. Future research should emphasize role construction and dialogue design for multimorbidity scenarios to support integrated management across multiple diseases.</p>
        <p>Furthermore, current systems exhibit instability in simulating vague or atypical symptoms (eg, chronic fatigue, low mood, attention deficits), demonstrating weak extraction of unstructured chief complaints and incoherent reasoning processes, which limits support for comprehensive clinical assessments [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Incorporating KGs and CoT reasoning mechanisms may improve knowledge organization and causal chain construction, enhancing models’ reasoning and response capabilities in complex clinical scenarios [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. Open-source KGs, such as UMLS (Unified Medical Language System) [<xref ref-type="bibr" rid="ref60">60</xref>] and SNOMED-CT (Systematized Nomenclature of Medicine - Clinical Terms) [<xref ref-type="bibr" rid="ref61">61</xref>], can be integrated to provide structured, high-coverage medical knowledge. Leveraging these KGs allows LLMs to reference verified entities and relationships during dialogue generation, reducing hallucinations and enhancing reasoning in multimorbidity or complex symptom scenarios.</p>
        <p>Additionally, coverage of specialty domains remains limited, particularly in surgical and orthopedic, ophthalmological, and dermatological contexts requiring procedural or visual recognition skills. These fields show constraints in case complexity, multimodal interaction support, and procedural training, limiting educational adaptability and clinical fidelity [<xref ref-type="bibr" rid="ref43">43</xref>]. This observation is consistent with findings from RQ1 and RQ4, indicating that certain specialty cases and complex patient scenarios remain underrepresented.</p>
        <p>In summary, current virtual patient systems require systematic optimization in disease coverage balance, case complexity, multimorbidity simulation, specialty applicability, multimodal capabilities, and interaction depth. These improvements are expected to enhance clinical fidelity, educational adaptability, and natural human-computer interaction, providing a foundation for advancing intelligent medical education and clinical decision support systems.</p>
      </sec>
      <sec>
        <title>Technical Challenges and Solutions of LLM-Based Virtual Patient Simulation</title>
        <p>In virtual patient simulation, prompts guide LLMs to generate dialogues conforming to specific role characteristics. However, overly long prompts lead to information loss, affecting dialogue quality and coherence. Specifically, GPT-style models based on autoregressive decoder architectures process prompts sequentially from left to right, prioritizing information at the beginning (primacy effect) and end (recency effect), while middle information may be ignored or inadequately processed, impacting content completeness and accuracy [<xref ref-type="bibr" rid="ref62">62</xref>]. Additionally, few-shot learning maintains role settings using limited historical dialogues, but balancing information completeness and avoiding overload in multiturn complex dialogues remains challenging. To address this, core information should be prioritized at the prompt’s beginning and end, with emotional content placed in the middle to ensure accurate and consistent information transmission.</p>
        <p>Compared to single-prompt methods, multiagent frameworks decompose dialogue generation into modular components, with agents handling specific functions such as patient history or simulated emotions, or leveraging adversarial training to enhance dialogue quality [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. However, multiagent systems increase computational costs, as each agent independently calls APIs, leading to higher financial burdens. Additionally, fine-tuning using older APIs may cause “memory loss,” affecting dialogue context continuity. Dialogue saving and restoration mechanisms are necessary to ensure information consistency. To reduce costs and improve efficiency, dialogue compression and summarization mechanisms can simplify context input, and noncritical tasks can be assigned to lower-cost APIs to balance performance and expense.</p>
        <p>Research on fine-tuning GPT-style models and integrating KGs remains limited. Liu et al [<xref ref-type="bibr" rid="ref17">17</xref>] proposed an SFT strategy using synthetic medical records and manually annotated dialogues, significantly enhancing the realism and anthropomorphism of patient history collection dialogues on the Qwen-72B model (Alibaba Cloud), reducing Hallucination Rate from 3.71% to 0.31%. However, their training process did not explicitly embed disease names into the Transformer model, limiting specific disease knowledge mastery. Prior studies indicate that incorporating disease names as labels or input features improves semantic understanding and generalization, particularly for simulating patient histories for specific diseases [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. Combining CoT methods with fine-tuning to guide step-by-step reasoning for complex medical knowledge is underexplored and presents a potential direction for medical dialogue systems.</p>
        <p>KGs systematically represent structured medical knowledge, but there is a lack of studies comparing disease embeddings trained by LLMs with KG embeddings to evaluate differences in knowledge representation and reasoning capabilities. KGs’ clear entity and relation structures complement LLMs’ limitations in sparse knowledge or verification, while LLM-trained embeddings as a baseline reveal constraints in semantic understanding and improvements from KG enhancement. Future research should establish a unified evaluation framework integrating supervised fine-tuning, LoRA, and CoT reasoning to explore the complementary roles of KG and LLM embeddings, advancing medical dialogue models in logical reasoning and knowledge accuracy.</p>
      </sec>
      <sec>
        <title>Evaluation Design Suggestions and Summary of Metrics</title>
        <p>Evaluation methods and metrics in LLM-based virtual patient systems are diverse, reflecting the multidimensional nature of performance and educational outcomes. However, the lack of standardized frameworks hinders cross-study comparison and generalizability. Diagnostic accuracy is often measured by top-k accuracy (eg, 65% [<xref ref-type="bibr" rid="ref19">19</xref>]) and GTPA@k [<xref ref-type="bibr" rid="ref25">25</xref>], though these coarse metrics may not capture system capabilities. Information Coverage and Hallucination Rate assess retrieval fidelity but usually rely on manual verification. Interaction quality metrics—such as semantic similarity (0.7589 [<xref ref-type="bibr" rid="ref18">18</xref>]) and user scores (eg, CUQ=77 [<xref ref-type="bibr" rid="ref22">22</xref>])—highlight interactivity but are limited by subjectivity and small samples. External studies also report high usability, such as SUS=88.1 [<xref ref-type="bibr" rid="ref31">31</xref>], a 10-item scale measuring ease of use, confidence, and learnability, reinforcing the systems’ educational value despite inconsistent evaluation standards.</p>
        <p>Moreover, system performance assessment remains insufficient. Some studies report response delays affecting conversation naturalness, but lack quantitative measures or systematic evaluations. The absence of standardized key performance indicators exacerbates framework fragmentation, hindering effective implementation and broader application in medical education and effective clinical training [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>To address fragmented metrics, a unified and scientifically grounded evaluation framework is necessary. Key performance indicators with recommended thresholds are proposed to guide system design and assessment:</p>
        <list list-type="bullet">
          <list-item>
            <p>Top-1 accuracy (≥0.80): AMIE achieved a top-1 accuracy of 0.65, outperforming primary care physicians (0.53) [<xref ref-type="bibr" rid="ref19">19</xref>], while <italic>GTPA</italic>@1 reached 0.96 [<xref ref-type="bibr" rid="ref25">25</xref>]. A threshold of 0.80 ensures reliable diagnostic performance.</p>
          </list-item>
          <list-item>
            <p>Hallucination rate (≤0.05): GPT-4o demonstrated a Hallucination Rate of 0.31% [<xref ref-type="bibr" rid="ref17">17</xref>], with rates below 5% reported in [<xref ref-type="bibr" rid="ref19">19</xref>], supporting medical safety standards.</p>
          </list-item>
          <list-item>
            <p>Information coverage (≥0.50): coverage of critical history items was 33.89% [<xref ref-type="bibr" rid="ref43">43</xref>], indicating room for improvement. A 50% threshold ensures adequate information capture.</p>
          </list-item>
          <list-item>
            <p>Empathy and Anthropomorphism Score (≥0.75, standardized 0-1 scale): GPT-4o scored 0.87 [<xref ref-type="bibr" rid="ref17">17</xref>], and [<xref ref-type="bibr" rid="ref37">37</xref>] reported approximately 0.76, indicating desired human-like interaction and empathy.</p>
          </list-item>
          <list-item>
            <p>Usability (SUS≥80, CUQ≥75): SUS of 88.1 [<xref ref-type="bibr" rid="ref59">59</xref>] and CUQ of 77 [<xref ref-type="bibr" rid="ref22">22</xref>] meet standards for satisfactory usability.</p>
          </list-item>
          <list-item>
            <p>Robustness (leak resistance≥0.90): a value of 0.9412 was reported in [<xref ref-type="bibr" rid="ref18">18</xref>], indicating compliance with privacy and ethical requirements.</p>
          </list-item>
          <list-item>
            <p>Rater consistency (<italic>κ</italic> and ICC≥0.80): high interrater reliability was observed, with ICC=0.924 [<xref ref-type="bibr" rid="ref39">39</xref>] and <italic>κ</italic>=0.832 [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Challenges of Data Diversity: Limitations in Corpus Coverage and Adaptability</title>
        <p>Existing datasets for training and evaluating LLM-based virtual patient systems are diverse, encompassing real-world EHRs, synthetic clinical scenarios, and multimodal or multilingual resources [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. <xref ref-type="table" rid="table4">Table 4</xref> summarizes key datasets, which are either publicly available or accessible through formal application processes. Analysis reveals several limitations impacting system development.</p>
        <p>Mainstream datasets like MIMIC-III primarily reflect intensive care scenarios, containing records from ICU patients with acute or critical conditions, often within internal medicine or neurological and rheumatological categories [<xref ref-type="bibr" rid="ref10">10</xref>]. This bias limits generalizability to nonacute settings, such as outpatient consultations, mental health disorders, or chronic disease management. Specialized datasets, including DDxPlus (internal medicine), iCraft-MD (dermatological), and RareBench (rare and multiple disease types), provide tailored resources [<xref ref-type="bibr" rid="ref25">25</xref>]. While valuable for specific domains, their narrow coverage limits suitability for general-purpose or multitask system training.</p>
        <p>Linguistic and cultural diversity is limited, as most corpora are English-based and originate from Western health care systems [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. The lack of datasets in other languages, such as Chinese [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], and integration with local KGs or region-specific disease contexts constrains performance in multilingual and cross-cultural environments. Modality limitations are evident; most datasets provide textual information and lack multimodal inputs like images, speech, or physiological signals [<xref ref-type="bibr" rid="ref47">47</xref>], restricting interaction realism and diagnostic reasoning.</p>
        <p>Data accessibility and format heterogeneity affect usability. Access requirements, annotation styles, field definitions, and dialogue formats vary [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], hindering integration and comprehensive training. Standardization of data formats and unified interfaces is necessary to reduce development costs and support broader adoption. Additionally, task alignment poses challenges, as datasets like MedQA [<xref ref-type="bibr" rid="ref45">45</xref>] are structured for multiple-choice or question-answering tasks, requiring extensive adaptation for dialogue generation.</p>
        <p>Combining broadly covering datasets like MIMIC-III [<xref ref-type="bibr" rid="ref10">10</xref>] with domain-specific (DDxPlus, iCraft-MD, RareBench) [<xref ref-type="bibr" rid="ref25">25</xref>] and multilingual or multimodal datasets (CCKS 2019 [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], Open-i [<xref ref-type="bibr" rid="ref47">47</xref>], medical-nlp [<xref ref-type="bibr" rid="ref18">18</xref>], MedQA [<xref ref-type="bibr" rid="ref45">45</xref>]) can partially address gaps. However, developing diverse, emotionally annotated, non-English, and multimodal datasets is essential to enhance generalizability, robustness, and interaction fidelity.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>This systematic review, conducted per PRISMA 2020 guidelines, evaluated studies (January 2020-August 18, 2025) on LLM-based virtual patient systems for medical history collection, sourced from 9 databases: PubMed, Scopus, Web of Science, IEEE Xplore, ACM Digital Library, Springer, ERIC, arXiv, and ACL Anthology. Following rigorous screening, deduplication, and quality appraisal, 6 high-quality and 33 moderate-quality studies were included, addressing 4 research questions: simulated patient types, performance-enhancing technologies, experimental designs, and evaluation metrics.</p>
        <p>Key findings include (1) systems primarily simulate internal medicine and mental health disorders (acute and chronic), with limited coverage of rare and multiple disease types, multimorbidity, and specialties like surgical and orthopedic, neurological and rheumatological, dermatological, and ophthalmological, restricting applicability in complex clinical reasoning and education [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. (2) Technologies such as role-based prompts, few-shot learning, multiagent frameworks, KGs, and fine-tuning (eg, SFT, LoRA, CoT, RLAIF+MoM) enhance dialogue coherence, retrieval accuracy (+16.02% with KGs) [<xref ref-type="bibr" rid="ref10">10</xref>], and high diagnostic accuracy, while multimodal integration (eg, speech) improves immersion [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. (3) Evaluations involved medical students and practitioners, using mixed methods (top-k accuracy, <italic>F</italic><sub>1</sub>-score, SUS, CUQ, and expert ratings) with comparisons across AI models, physicians, or prompt variations; small sample sizes (10-50 students and 3-10 experts) and inconsistent metrics limit generalizability [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. (4) Systems demonstrated high diagnostic accuracy: top-k accuracy 0.45-0.98, information coverage 33.89%-94.4%, Hallucination Rate 0.31%-5%, and high usability (SUS≥80), often outperforming junior physicians [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. 
Dataset limitations (eg, MIMIC-III ICU bias, restricted access, low multilingual and multimodal diversity) hinder cross-study comparability [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p>
        <p>The key discussion points are summarized: (1) Disease coverage is imbalanced, favoring internal medicine and mental health disorders over surgical and orthopedic, dermatological, ophthalmological, and multimorbidity scenarios, limiting effective clinical training [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Future systems should prioritize multimorbidity and diverse patient populations (cultural and linguistic) to enhance realism [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. (2) Prompt design suffers from information loss in long prompts; placing critical information at prompt ends and using dialogue compression or multiagent frameworks can mitigate this [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]. KG-LLM integration and fine-tuning improve performance, with potential for further gains via hybrid KG-CoT approaches [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. (3) Fragmented evaluation frameworks, inconsistent metrics, and small participant pools reduce reliability. A standardized framework with thresholds (eg, top-1 accuracy≥0.80, hallucination rate≤0.05, SUS≥80, CUQ≥75, κ and ICC≥0.80) and larger samples (50-100 students, 5-10 experts) is needed [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. (4) Dataset biases (eg, ICU focus), format heterogeneity, and privacy restrictions limit inclusivity. Open-access, ethically compliant, multimodal, and multilingual datasets are essential for equitable systems [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>].</p>
        <p>Future research should focus on large-scale longitudinal studies, standardized evaluation metrics, diverse open-access datasets (eg, UMLS [<xref ref-type="bibr" rid="ref60">60</xref>] and SNOMED-CT [<xref ref-type="bibr" rid="ref61">61</xref>]), and advanced integration of KGs, multimodal training, and optimized prompts to enhance the realism, high diagnostic accuracy, and fairness of LLM-based virtual patient systems in medical education [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref59">59</xref>].</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Database Search Strategies.</p>
        <media xlink:href="medinform_v14i1e79039_app1.docx" xlink:title="DOCX File , 13 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>PRISMA Checklist.</p>
        <media xlink:href="medinform_v14i1e79039_app2.docx" xlink:title="DOCX File , 26 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Joanna-Briggs Institute Quality Assessment Questionnaire.</p>
        <media xlink:href="medinform_v14i1e79039_app3.docx" xlink:title="DOCX File , 13 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Multidimensional assessment table.</p>
        <media xlink:href="medinform_v14i1e79039_app4.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Quality Assessment Result.</p>
        <media xlink:href="medinform_v14i1e79039_app5.docx" xlink:title="DOCX File , 27 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Risk of Bias Assessment for 39 Included Studies.</p>
        <media xlink:href="medinform_v14i1e79039_app6.docx" xlink:title="DOCX File , 26 KB"/>
      </supplementary-material>
      <supplementary-material id="app7">
        <label>Multimedia Appendix 7</label>
        <p>Calculation Formulas for Evaluation Metrics.</p>
        <media xlink:href="medinform_v14i1e79039_app7.docx" xlink:title="DOCX File , 23 KB"/>
      </supplementary-material>
      <supplementary-material id="app8">
        <label>Multimedia Appendix 8</label>
        <p>Other file screening process, source files, etc.</p>
        <media xlink:href="medinform_v14i1e79039_app8.zip" xlink:title="ZIP File  (Zip Archive), 1137 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AS</term>
          <def>
            <p>Anthropomorphism Score</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CoT</term>
          <def>
            <p>chain-of-thought</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CUQ</term>
          <def>
            <p>Chatbot Usability Questionnaire</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FDR</term>
          <def>
            <p>false discovery rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ICC</term>
          <def>
            <p>intraclass correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">KG</term>
          <def>
            <p>knowledge graph</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">LLMDP</term>
          <def>
            <p>LLM-based digital patient system</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">LoRA</term>
          <def>
            <p>Low-Rank Adaptation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">MIMIC-III</term>
          <def>
            <p>Medical Information Mart for Intensive Care-III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">OSCE</term>
          <def>
            <p>Objective Structured Clinical Examination</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">PCP</term>
          <def>
            <p>primary care physician</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">PRISMA</term>
          <def>
            <p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">RAG</term>
          <def>
            <p>retrieval-augmented generation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">RCT</term>
          <def>
            <p>randomized controlled trial</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">RLHF</term>
          <def>
            <p>reinforcement learning with human feedback</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">RQ</term>
          <def>
            <p>research question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb23">SFT</term>
          <def>
            <p>supervised fine-tuning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb24">SNOMED-CT</term>
          <def>
            <p>Systematized Nomenclature of Medicine - Clinical Terms</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb25">SUS</term>
          <def>
            <p>System Usability Scale</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb26">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank Sultan Qaboos University for partially funding this project. The corresponding author is also affiliated with Universiti Sains Malaysia (USM). Additional supporting materials related to this work are provided in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p>
    </ack>
    <notes>
      <sec>
        <title>Funding</title>
        <p>The authors thank Sultan Qaboos University (Oman) for the funding of this paper.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <year>2020</year>
          <conf-name>NIPS'20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>2020 December 06</conf-date>
          <conf-loc>Red Hook, NY, United States</conf-loc>
          <fpage>1877</fpage>
          <lpage>1901</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Akkaya</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aleman</surname>
              <given-names>FL</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Altenschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anadkat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Avila</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Babuschkin</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Balaji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balcom</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baltescu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bavarian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belgum</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>ArXiv. Preprint posted online on March 4,</source>
          <year>2024</year>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SWH</given-names>
            </name>
          </person-group>
          <article-title>Is technology enhanced learning cost-effective to improve skills?: the Monash objective structured clinical examination virtual experience</article-title>
          <source>Simul Healthc</source>
          <year>2022</year>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>131</fpage>
          <lpage>135</lpage>
          <pub-id pub-id-type="doi">10.1097/SIH.0000000000000526</pub-id>
          <pub-id pub-id-type="medline">33273417</pub-id>
          <pub-id pub-id-type="pii">01266021-202204000-00008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>Creating virtual patients using large language models: scalable, global, and low cost</article-title>
          <source>Med Teach</source>
          <year>2025</year>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>40</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2024.2376879</pub-id>
          <pub-id pub-id-type="medline">38992981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>An intelligent virtual standard patient for medical students training based on oral knowledge graph</article-title>
          <source>IEEE Trans Multimedia</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>6132</fpage>
          <lpage>6145</lpage>
          <pub-id pub-id-type="doi">10.1109/tmm.2022.3205456</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Babu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boddu</surname>
              <given-names>SB</given-names>
            </name>
          </person-group>
          <article-title>BERT-based medical chatbot: enhancing healthcare communication through natural language understanding</article-title>
          <source>Explor Res Clin Soc Pharm</source>
          <year>2024</year>
          <volume>13</volume>
          <fpage>100419</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2667-2766(24)00014-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.rcsop.2024.100419</pub-id>
          <pub-id pub-id-type="medline">38495953</pub-id>
          <pub-id pub-id-type="pii">S2667-2766(24)00014-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC10940906</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chow</surname>
              <given-names>JCL</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Generative pre-trained transformer-empowered healthcare conversations: current trends, challenges, and future directions in large language model-enabled medical chatbots</article-title>
          <source>BioMedInformatics</source>
          <year>2024</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>837</fpage>
          <lpage>852</lpage>
          <pub-id pub-id-type="doi">10.3390/biomedinformatics4010047</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holderried</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Stegemann-Philipps</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Herrmann-Werner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Festl-Wietek</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Holderried</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Eickhoff</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mahling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A language model-powered simulated patient with automated feedback for history taking: prospective study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e59213</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e59213/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/59213</pub-id>
          <pub-id pub-id-type="medline">39150749</pub-id>
          <pub-id pub-id-type="pii">v10i1e59213</pub-id>
          <pub-id pub-id-type="pmcid">PMC11364946</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>K-J</given-names>
            </name>
          </person-group>
          <article-title>The feasibility of using generative artificial intelligence for history taking in virtual patients</article-title>
          <source>BMC Res Notes</source>
          <year>2025</year>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>80</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcresnotes.biomedcentral.com/articles/10.1186/s13104-025-07157-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13104-025-07157-8</pub-id>
          <pub-id pub-id-type="medline">39994780</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13104-025-07157-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC11849343</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gallifant</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gupte</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M-L</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bitterman</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Assimes</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>AIPatient: simulating patients with EHRs and LLM powered agentic workflow</article-title>
          <source>ArXiv. Preprint posted online on July 29,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2409.18924</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Upperman</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>A systematic review of large language models and their implications in medical education</article-title>
          <source>Med Educ</source>
          <year>2024</year>
          <volume>58</volume>
          <issue>11</issue>
          <fpage>1276</fpage>
          <lpage>1285</lpage>
          <pub-id pub-id-type="doi">10.1111/medu.15402</pub-id>
          <pub-id pub-id-type="medline">38639098</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>García-Torres</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vicente Ripoll</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Fernández Peris</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mira Solves</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>Enhancing clinical reasoning with virtual patients: a hybrid systematic review combining human reviewers and ChatGPT</article-title>
          <source>Healthcare (Basel)</source>
          <year>2024</year>
          <volume>12</volume>
          <issue>22</issue>
          <fpage>2241</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare12222241"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare12222241</pub-id>
          <pub-id pub-id-type="medline">39595439</pub-id>
          <pub-id pub-id-type="pii">healthcare12222241</pub-id>
          <pub-id pub-id-type="pmcid">PMC11594149</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fatima</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shafique</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fadlalla Ahmed</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Mustafa</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medicine: a cross-disciplinary systematic review of ChatGPT's (artificial intelligence) role in research, clinical practice, education, and patient interaction</article-title>
          <source>Medicine (Baltimore)</source>
          <year>2024</year>
          <volume>103</volume>
          <issue>32</issue>
          <fpage>e39250</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1097/MD.0000000000039250"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MD.0000000000039250</pub-id>
          <pub-id pub-id-type="medline">39121303</pub-id>
          <pub-id pub-id-type="pii">00005792-202408090-00060</pub-id>
          <pub-id pub-id-type="pmcid">PMC11315549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Page</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>McKenzie</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Boutron</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffmann</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Mulrow</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Shamseer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tetzlaff</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Akl</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Glanville</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grimshaw</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Hróbjartsson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lalu</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Loder</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Mayo-Wilson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McGuinness</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tricco</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Welch</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The PRISMA 2020 statement: an updated guideline for reporting systematic reviews</article-title>
          <source>Syst Rev</source>
          <year>2021</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>89</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-021-01626-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-021-01626-4</pub-id>
          <pub-id pub-id-type="medline">33781348</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-021-01626-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC8008539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>W-Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>S-Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J-H</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>B-W</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Gen-SynDi: leveraging knowledge-guided generative AI for dual education of syndrome differentiation and disease diagnosis</article-title>
          <source>Applied Sciences</source>
          <year>2025</year>
          <volume>15</volume>
          <issue>9</issue>
          <fpage>4862</fpage>
          <pub-id pub-id-type="doi">10.3390/app15094862</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haider</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Prabha</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez-Cabello</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Borna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Genovese</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Trabilsy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Collaco</surname>
              <given-names>BG</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Bagaria</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Forte</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>Synthetic patient-physician conversations simulated by large language models: a multi-dimensional evaluation</article-title>
          <source>Sensors (Basel)</source>
          <year>2025</year>
          <volume>25</volume>
          <issue>14</issue>
          <fpage>4305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s25144305"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s25144305</pub-id>
          <pub-id pub-id-type="medline">40732431</pub-id>
          <pub-id pub-id-type="pii">s25144305</pub-id>
          <pub-id pub-id-type="pmcid">PMC12298718</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Exploring the inquiry-diagnosis relationship with advanced patient simulators</article-title>
          <source>ArXiv. Preprint posted online on March 11,</source>
          <year>2025</year>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>LLMs can simulate standardized patients via agent coevolution</article-title>
          <source>ArXiv. Preprint posted online on June 7,</source>
          <year>2025</year>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saab</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Freyberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Strother</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W-H</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>DGT</given-names>
            </name>
            <name name-style="western">
              <surname>Stutz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Palepu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liévin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ruparel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vedadi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kanada</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hughes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Manyika</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hassidim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Kohli</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Eslami</surname>
              <given-names>SMA</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J-K</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Schaekermann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tanno</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Advancing conversational diagnostic AI with multimodal reasoning</article-title>
          <source>ArXiv. Preprint posted online on May 6,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2505.04653</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yamamoto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Koda</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ogawa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Miyoshi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Maeda</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Otsuka</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ino</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Enhancing medical interview skills through AI-simulated patient interactions: nonrandomized controlled trial</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e58753</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e58753/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/58753</pub-id>
          <pub-id pub-id-type="medline">39312284</pub-id>
          <pub-id pub-id-type="pii">v10i1e58753</pub-id>
          <pub-id pub-id-type="pmcid">PMC11459107</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schaekermann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Palepu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saab</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Freyberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tanno</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vedadi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Webson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kulkarni</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Towards conversational diagnostic artificial intelligence</article-title>
          <source>Nature</source>
          <year>2025</year>
          <volume>642</volume>
          <issue>8067</issue>
          <fpage>442</fpage>
          <lpage>450</lpage>
          <pub-id pub-id-type="doi">10.1038/s41586-025-08866-7</pub-id>
          <pub-id pub-id-type="medline">40205050</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-025-08866-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC12158756</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holderried</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Stegemann-Philipps</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Herschbach</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Moldt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nevins</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Griewatz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Holderried</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Herrmann-Werner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Festl-Wietek</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A Generative Pretrained Transformer (GPT)-powered chatbot as a simulated patient to practice history taking: prospective, mixed methods study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e53961</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e53961/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/53961</pub-id>
          <pub-id pub-id-type="medline">38227363</pub-id>
          <pub-id pub-id-type="pii">v10i1e53961</pub-id>
          <pub-id pub-id-type="pmcid">PMC10828948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Leveraging large language model as simulated patients for clinical education</article-title>
          <source>ArXiv. Preprint posted online on April 25,</source>
          <year>2024</year>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Laverde</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Grévisse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jaramillo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Manrique</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Integrating large language model-based agents into a virtual patient chatbot for clinical anamnesis training</article-title>
          <source>Comput Struct Biotechnol J</source>
          <year>2025</year>
          <volume>27</volume>
          <fpage>2481</fpage>
          <lpage>2491</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csbj.2025.05.025</pub-id>
          <pub-id pub-id-type="medline">40547455</pub-id>
          <pub-id pub-id-type="pii">S2001-0370(25)00185-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC12180958</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lepri</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alqassem</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Gashteovski</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lawrence</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>MEDDxAgent: a unified modular agent framework for explainable automatic differential diagnosis</article-title>
          <source>ArXiv. Preprint posted online on February 26,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2502.19175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gattani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Enhancing medical history collection using LLMs</article-title>
          <year>2024</year>
          <conf-name>ACSW '24: Proceedings of the 2024 Australasian Computer Science Week</conf-name>
          <conf-date>2024 May 13</conf-date>
          <conf-loc>Sydney, NSW, Australia</conf-loc>
          <fpage>140</fpage>
          <lpage>143</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Seo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>LLM-based clinical history taking system: a persona-driven approach</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2025</year>
          <volume>329</volume>
          <fpage>1866</fpage>
          <lpage>1867</lpage>
          <pub-id pub-id-type="doi">10.3233/SHTI251254</pub-id>
          <pub-id pub-id-type="medline">40776271</pub-id>
          <pub-id pub-id-type="pii">SHTI251254</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takata</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yamada</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>René</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Development of a virtual patient model for Kampo medical interview: new approach for enhancing empathy and understanding of Kampo medicine pathological concepts</article-title>
          <year>2024</year>
          <conf-name>2024 Joint 13th International Conference on Soft Computing and Intelligent Systems and 25th International Symposium on Advanced Intelligent Systems (SCIS&#38;ISIS)</conf-name>
          <conf-date>2024 November 01</conf-date>
          <conf-loc>Himeji, Japan</conf-loc>
          <fpage>1</fpage>
          <lpage>5</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benfatah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Marfak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saad</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hilali</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nejjari</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Youlyouz-Marfak</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Assessing the efficacy of ChatGPT as a virtual patient in nursing simulation training: a study on nursing students' experience</article-title>
          <source>Teach Learn Nurs</source>
          <year>2024</year>
          <volume>19</volume>
          <issue>3</issue>
          <fpage>e486</fpage>
          <lpage>e493</lpage>
          <pub-id pub-id-type="doi">10.1016/j.teln.2024.02.005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rashidian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Amar</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pugh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Masterson</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cha</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vaid</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>AI agents for conversational patient triage: preliminary simulation-based evaluation with real-world EHR data</article-title>
          <source>ArXiv. Preprint posted online on June 4,</source>
          <year>2025</year>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kenny</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Parsons</surname>
              <given-names>TD</given-names>
            </name>
          </person-group>
          <article-title>Virtual standardized LLM-AI patients for clinical practice</article-title>
          <source>Annual Review of Cybertherapy And Telemedicine</source>
          <year>2024</year>
          <volume>22</volume>
          <fpage>177</fpage>
          <lpage>182</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Milani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Eack</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Labrum</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hardy</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>ZZ</given-names>
            </name>
          </person-group>
          <article-title>PATIENT-Ψ: using large language models to simulate patients for training mental health professionals</article-title>
          <source>ArXiv. Preprint posted online on October 3,</source>
          <year>2024</year>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>LLM-empowered chatbots for psychiatrist and patient simulation: application and evaluation</article-title>
          <source>ArXiv. Preprint posted online on May 23,</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2305.13614</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Staples</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Leininger</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Principato</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shafiei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goodwin</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Building virtual patients for training mental health professionals</article-title>
          <source>PsyArXiv</source>
          <year>2025</year>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ryu</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Kwon</surname>
              <given-names>CY</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>HL</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>HJ</given-names>
            </name>
          </person-group>
          <article-title>Development and application of a ChatGPT-based simulation training platform for Korean medicine</article-title>
          <source>Journal of Oriental Neuropsychiatry</source>
          <year>2024</year>
          <volume>35</volume>
          <issue>4</issue>
          <fpage>413</fpage>
          <lpage>427</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rädel-Ablass</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schliz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schlick</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Meindl</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Pahr-Hosbach</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schwendemann</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rupp</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Roddewig</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Miersch</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Teaching opportunities for anamnesis interviews through AI based teaching role plays: a survey with online learning students from health study programs</article-title>
          <source>BMC Med Educ</source>
          <year>2025</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>259</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-025-06756-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-025-06756-0</pub-id>
          <pub-id pub-id-type="medline">39966894</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-025-06756-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC11834289</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodonhelyi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stegemann-Philipps</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sonanini</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Herschbach</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Szép</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Herrmann-Werner</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Modeling challenging patient interactions: LLMs for medical communication training</article-title>
          <source>ArXiv. Preprint posted online on April 8,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2503.22250</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Leypold</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schäfer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Boos</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Beier</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence-powered hand surgery consultation: GPT-4 as an assistant in a hand surgery outpatient clinic</article-title>
          <source>J Hand Surg Am</source>
          <year>2024</year>
          <volume>49</volume>
          <issue>11</issue>
          <fpage>1078</fpage>
          <lpage>1088</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0363-5023(24)00261-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jhsa.2024.06.002</pub-id>
          <pub-id pub-id-type="medline">39066762</pub-id>
          <pub-id pub-id-type="pii">S0363-5023(24)00261-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brügge</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ricchizzi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Arenbeck</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Keller</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Schur</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stummer</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Holling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Darici</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Large language models improve clinical decision making of medical students through patient simulation and structured feedback: a randomized controlled trial</article-title>
          <source>BMC Med Educ</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>1391</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-024-06399-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-024-06399-7</pub-id>
          <pub-id pub-id-type="medline">39609823</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-024-06399-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC11605890</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Georg</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jobs</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Huss</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Waldenlind</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ruiz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Edelbring</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Skantze</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Parodis</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Virtual patient simulations using social robotics combined with large language models for clinical reasoning training in medical education: mixed methods study</article-title>
          <source>J Med Internet Res</source>
          <year>2025</year>
          <volume>27</volume>
          <fpage>e63312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2025//e63312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/63312</pub-id>
          <pub-id pub-id-type="medline">40053778</pub-id>
          <pub-id pub-id-type="pii">v27i1e63312</pub-id>
          <pub-id pub-id-type="pmcid">PMC11914843</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Parodis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Skantze</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Creating virtual patients using robots and large language models: a preliminary study with medical students</article-title>
          <year>2024</year>
          <conf-name>HRI '24: Companion of the 2024 ACM/IEEE International Conference on Human-Robot Interaction</conf-name>
          <conf-date>2024 March 11 - 15</conf-date>
          <conf-loc>Boulder CO USA</conf-loc>
          <fpage>273</fpage>
          <lpage>277</lpage>
          <pub-id pub-id-type="doi">10.1145/3610978.3640592</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tsui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A large language model digital patient system enhances ophthalmology history taking skills</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>502</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-025-01841-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-025-01841-6</pub-id>
          <pub-id pub-id-type="medline">40760042</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-025-01841-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC12322286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Automatic interactive evaluation for large language models with state aware patient simulator</article-title>
          <source>ArXiv. Preprint posted online on July 21,</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.2139/ssrn.4890649</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>BA</given-names>
            </name>
            <collab>et al</collab>
          </person-group>
          <article-title>CRAFT-MD: A conversational evaluation framework for comprehensive assessment of clinical LLMs</article-title>
          <year>2024</year>
          <conf-name>AAAI 2024 Spring Symposium on Clinical Foundation Models</conf-name>
          <conf-date>March 25-27, 2024</conf-date>
          <conf-loc>Stanford University, Stanford, California</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Driess</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schaekermann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tanno</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ktena</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Palepu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mustafa</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kornblith</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fleet</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Prakash</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Virmani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dominowska</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Arcas</surname>
              <given-names>BAY</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Florence</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Towards generalist biomedical AI</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <volume>642</volume>
          <issue>3</issue>
          <fpage>442</fpage>
          <lpage>450</lpage>
          <pub-id pub-id-type="doi">10.1056/aioa2300138</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rodrigo</surname>
              <given-names>MMT</given-names>
            </name>
            <name name-style="western">
              <surname>Castaneda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>RB</given-names>
            </name>
            <name name-style="western">
              <surname>Alaan</surname>
              <given-names>JAMV</given-names>
            </name>
            <name name-style="western">
              <surname>Caoile</surname>
              <given-names>PSP</given-names>
            </name>
            <name name-style="western">
              <surname>Chidrome</surname>
              <given-names>KD</given-names>
            </name>
          </person-group>
          <article-title>Project Caladrius: The design of a virtual patient for Philippine medical education</article-title>
          <year>2025</year>
          <conf-name>7th International Conference, AIS 2025, Held as Part of the 27th HCI International Conference, HCII</conf-name>
          <conf-date>2025 June 22–27</conf-date>
          <conf-loc>Gothenburg, Sweden</conf-loc>
          <fpage>232</fpage>
          <lpage>244</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-031-92970-0_17</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>MedDiT: A knowledge-controlled diffusion transformer framework for dynamic medical image generation in virtual simulated patient</article-title>
          <source>ArXiv. Preprint posted online on August 22,</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.24963/ijcai.2024/1267</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Turner</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kropczynski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ozer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Halse</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>LLM-as-a-fuzzy-judge: Fine-tuning large language models as a clinical evaluation judge with fuzzy logic</article-title>
          <source>ArXiv. Preprint posted online on June 12,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2506.11221</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Koh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Foong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Real-time hybrid language model for virtual patient conversations</article-title>
          <year>2023</year>
          <conf-name>International Conference on Artificial Intelligence in Education</conf-name>
          <conf-date>2023 3 July</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
          <fpage>780</fpage>
          <lpage>785</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-031-36272-9_71</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Overgaard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pankratz</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Del Fiol</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Aakre</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Virtual patients using large language models: scalable, contextualized simulation of clinician-patient dialogue with feedback</article-title>
          <source>J Med Internet Res</source>
          <year>2025</year>
          <volume>27</volume>
          <fpage>e68486</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e68486/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/68486</pub-id>
          <pub-id pub-id-type="medline">39854611</pub-id>
          <pub-id pub-id-type="pii">v27i1e68486</pub-id>
          <pub-id pub-id-type="pmcid">PMC12008702</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thesen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>O’Brien</surname>
              <given-names>WN</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pinto-Powell</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Generative AI as the first patient: practice, feedback, and confidence</article-title>
          <source>Med Sci Educ</source>
          <year>2025</year>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1007/s40670-025-02473-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fansi</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Goel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Martel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosn</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>DDXPlus: a new dataset for automatic medical diagnosis</article-title>
          <source>ArXiv. Preprint posted online on October 13,</source>
          <year>2022</year>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Agent hospital: a simulacrum of hospital with evolvable medical agents</article-title>
          <source>ArXiv. Preprint posted online on January 17,</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2405.02957</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>RareBench: can LLMs serve as rare diseases specialists?</article-title>
          <year>2024</year>
          <conf-name>KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>2024 August 25 - 29</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <fpage>4850</fpage>
          <lpage>4861</lpage>
          <pub-id pub-id-type="doi">10.1145/3637528.3671576</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Overview of the CCKS 2019 knowledge graph evaluation track: entity, relation, event and QA</article-title>
          <source>ArXiv. Preprint posted online on March 9,</source>
          <year>2020</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2003.03875</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kohli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shooshan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Antani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Thoma</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Preparing a collection of radiology examinations for distribution and retrieval</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <volume>23</volume>
          <issue>2</issue>
          <fpage>304</fpage>
          <lpage>310</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26133894"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv080</pub-id>
          <pub-id pub-id-type="medline">26133894</pub-id>
          <pub-id pub-id-type="pii">ocv080</pub-id>
          <pub-id pub-id-type="pmcid">PMC5009925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Altosaar</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ranganath</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title>
          <source>ArXiv. Preprint posted online on November 29,</source>
          <year>2020</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.1904.05342</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steenstra</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nouraei</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bickmore</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Scaffolding empathy: training counselors with simulated patients and utterance-level performance visualizations</article-title>
          <year>2025</year>
          <conf-name>CHI '25: Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems</conf-name>
          <conf-date>2025 1 May</conf-date>
          <conf-loc>Yokohama Japan</conf-loc>
          <fpage>1</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1145/3706598.3714014</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The unified medical language system (UMLS): integrating biomedical terminology</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <volume>32</volume>
          <issue>Database issue</issue>
          <fpage>D267</fpage>
          <lpage>D270</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/14681409"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id>
          <pub-id pub-id-type="medline">14681409</pub-id>
          <pub-id pub-id-type="pii">32/suppl_1/D267</pub-id>
          <pub-id pub-id-type="pmcid">PMC308795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hole</surname>
              <given-names>WT</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Srinivasan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Integrating SNOMED CT into the UMLS: an exploration of different views of synonymy and quality of editing</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2005</year>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>486</fpage>
          <lpage>494</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/15802483"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M1767</pub-id>
          <pub-id pub-id-type="medline">15802483</pub-id>
          <pub-id pub-id-type="pii">M1767</pub-id>
          <pub-id pub-id-type="pmcid">PMC1174894</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hewitt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Paranjape</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bevilacqua</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Petroni</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Lost in the middle: how language models use long contexts</article-title>
          <source>ArXiv. Preprint posted online on November 20,</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00638</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
