<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e73857</article-id><article-id pub-id-type="doi">10.2196/73857</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Enhancing Large Language Models With AI Agents for Chronic Gastritis Management: Comprehensive Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Shurui</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ye</surname><given-names>Qing</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Tongji Hospital, Tongji Medical College, Huazhong University of Science and Technology</institution><addr-line>1095 Jiefang Ave</addr-line><addr-line>Wuhan</addr-line><addr-line>Hubei</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Kueper</surname><given-names>Jacqueline</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Immanuvel Arockiasamy</surname><given-names>Jesu Marcus</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ramos</surname><given-names>Mayk Caldas</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Qing Ye, PhD, Tongji Hospital, Tongji Medical College, Huazhong University of Science and Technology, 1095 Jiefang Ave, Wuhan, Hubei, 430000, China, +86 188-0276-3109; <email>qye@tjh.tjmu.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>13</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e73857</elocation-id><history><date date-type="received"><day>13</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>10</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>13</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Shurui Wang, Qing Ye. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 13.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e73857"/><abstract><sec><title>Background</title><p>The prevalence of chronic gastritis is high, and if not intervened in a timely manner, it may eventually lead to gastric cancer. Managing chronic gastritis essentially requires comprehensive lifestyle changes. However, the current health care environment does not support continuous follow-up by professional health care providers, making self-management a key component of postdiagnosis care. Increasingly, researchers are exploring the use of large language models (LLMs) for patient management. However, LLMs have limitations, including hallucinations, limited knowledge scope, and lack of timeliness. Artificial intelligence (AI) agents may provide a more effective solution. 
Nevertheless, it remains uncertain whether AI agents can effectively support postdiagnosis self-management for patients with chronic gastritis.</p></sec><sec><title>Objective</title><p>The purpose of this study was to explore the effectiveness of AI agents in the postdiagnosis management of patients with chronic gastritis from different perspectives.</p></sec><sec sec-type="methods"><title>Methods</title><p>In this study, we developed an agent framework for the health management of patients with chronic gastritis based on LLMs in conjunction with retrieval-augmented generation and a search engine tool. We collected real questions from patients with chronic gastritis in clinical settings and tested the framework&#x2019;s performance across different difficulty levels and scenarios. We analyzed its safety and robustness and compared it with state-of-the-art models to comprehensively evaluate its effectiveness.</p></sec><sec sec-type="results"><title>Results</title><p>Using a dual-evaluation framework comprising automated metrics and expert manual assessments, our results demonstrated that AI agents substantially outperformed LLMs in addressing high-complexity questions (embedding average score: 82.849 for AI agents vs 77.825 for LLMs) and were particularly effective in clinical consultation tasks. Clinical evaluation of safety based on a 5-point Likert scale by physicians indicated that the safety of the agents was 4.98 (SD 0.15; 95% CI 4.96-4.99). After 30 repeated experiments, the mean absolute deviations of the AI agents in the embedding average score and BERTScore metrics were 0.0167 and 0.0387, respectively. Therefore, the safety and robustness analysis confirmed that the AI agents can produce safe, stable, and minimally variable responses. 
In addition, comparative results with those of advanced medical-domain LLMs (Baichuan-14B-M1 and MedGemma-27B) and general-domain LLMs (Qwen3-32B) also demonstrated that the AI agents in this study performed outstandingly in the field of chronic gastritis. Our findings underscore the superior reliability, interpretability, and practical applicability of AI agents over conventional LLMs in chronic gastritis management, offering a robust foundation for their broader adoption in health care settings.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>AI agents based on LLMs have high application value in the management of chronic gastritis. They can effectively guide patients with chronic diseases in addressing common issues, which may potentially reduce the workload of physicians and improve the quality of patient home care.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence agent</kwd><kwd>AI agent</kwd><kwd>chronic gastritis</kwd><kwd>health management</kwd><kwd>retrieval-augmented generation</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Chronic gastritis is one of the most common disorders in the digestive system and is also the initial phase in the progression to gastric cancer [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. It is characterized by insidious onset, a protracted disease course, high prevalence, and frequent recurrence, as well as substantial health care costs, all of which significantly compromise patients&#x2019; quality of life [<xref ref-type="bibr" rid="ref4">4</xref>]. Data show that nearly half of patients with chronic atrophic gastritis experience anxiety [<xref ref-type="bibr" rid="ref5">5</xref>], and prolonged anxiety is also identified as a risk factor for the exacerbation of chronic gastritis. 
The management of chronic gastritis inherently requires comprehensive lifestyle modifications, and self-management has emerged as a critical component in chronic gastritis care. Patients can achieve not only effective symptom control but also meaningful improvement in overall health-related quality of life through systematic self-monitoring, evidence-based lifestyle adjustments, and structured psychological support, ultimately progressing toward holistic wellness.</p><p>Current constraints within the health care system render continuous postdiagnosis management by medical professionals impractical and unsustainable. The substantial heterogeneity in patient demographics further compounds clinical workload burdens. Particularly in resource-limited settings and geographically remote areas with uneven health care distribution [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], temporal and financial constraints make regular in-person clinical follow-ups largely unfeasible. While patient self-management presents a viable strategy to mitigate these resource limitations, the selection of appropriate self-care modalities remains paramount. Inappropriate information-seeking behaviors may lead to the acquisition of erroneous medical knowledge, potentially yielding adverse clinical outcomes. For instance, exclusive reliance on search engines for medical guidance is problematic due to the absence of professional interaction, where inaccuracies in symptom interpretation or medical misinformation may precipitate serious consequences [<xref ref-type="bibr" rid="ref8">8</xref>]. Online consultations in the internet era provide convenience for patient inquiries. However, patients often express concerns about the protection of their personal privacy and a lack of trust in the professionalism of online services [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. 
In addition, the uncertainty surrounding patients&#x2019; online medical needs and the availability of physicians is highly likely to intensify their hesitation and resistance toward online consultations.</p><p>Since 2018, when OpenAI introduced the first generative pretrained transformer model, GPT-1 [<xref ref-type="bibr" rid="ref11">11</xref>], large language models (LLMs) have ushered in a golden age. In recent years, the surge in the development of LLMs has inevitably sparked transformative changes in the medical field [<xref ref-type="bibr" rid="ref12">12</xref>]. Ayers et al [<xref ref-type="bibr" rid="ref13">13</xref>] compared the performance of ChatGPT with that of physicians in responding to patient inquiries on social media. The results indicated that the responses generated by ChatGPT were of higher quality and received greater patient approval. In addition, LLMs are capable of maintaining continuous communication with patients around the clock, a level of availability that is nearly impossible for human responders to achieve and difficult to improve upon. Furthermore, LLMs can address sensitive questions posed by patients, which are often challenging to broach or are met with hesitation during face-to-face consultations.</p><p>However, complete reliance on LLMs also carries certain risks as the issue of hallucinations cannot be overlooked, particularly in the health care domain. As hallucinations cannot be entirely eliminated, we can strive to minimize their occurrence. Research has shown that the agent framework [<xref ref-type="bibr" rid="ref14">14</xref>] can significantly reduce the hallucination rate [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Meanwhile, the knowledge of LLMs is derived from their pretraining data, which inherently have limitations such as restricted scope and lack of timeliness [<xref ref-type="bibr" rid="ref17">17</xref>]. 
To better address the complex and dynamic management needs of patients with chronic gastritis, an agent system built upon LLMs but not confined to the existing content in the training data may offer a more effective solution. As LLMs demonstrate remarkable capabilities and attract widespread attention, an increasing number of researchers are leveraging these models to develop artificial intelligence (AI) agent systems [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Agents have been proven to possess capabilities beyond those of LLMs [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Nevertheless, whether agents can be effectively used for the management of patients with chronic gastritis remains uncertain.</p><p>Our research specifically designed a question-and-answer (Q&#x0026;A) dataset for chronic gastritis and analyzed the feasibility of applying agents in the medical field. Furthermore, by categorizing health management tasks, problem complexity, and the scale of LLMs, we compared the content of responses generated by LLMs and agents. This study aimed to provide guidance for the application of AI agents in the medical field and offer practical AI tools for the postdiagnosis management of patients with chronic gastritis.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>The effectiveness of chronic gastritis treatment depends to some extent on the patient&#x2019;s educational background, living environment, and personal habits. The actual circumstances of patients vary, leading to differences in the complexity of the questions they raise. In addition, prompts can significantly influence the performance of LLMs [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and LLMs of different scales exhibit varying capabilities in processing prompts. 
Therefore, taking into account the characteristics of the disease, the scale of the model, and the needs of patients, we aimed to explore the following research questions:</p><list list-type="order"><list-item><p>In the context of chronic gastritis management, does a larger LLM lead to better response content?</p></list-item><list-item><p>In chronic gastritis management, with a consistent parameter size, does an agent outperform an LLM?</p></list-item><list-item><p>How does the response content of Q&#x0026;A models change as the difficulty of the questions increases?</p></list-item><list-item><p>Across different health management task scenarios, does an agent consistently outperform an LLM?</p></list-item></list><p>To address the aforementioned questions, we propose the methodological framework illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. On the basis of health management task scenarios (lifestyle intervention, medication guidance, and clinical consultation) and problem complexity (level 1, level 2, and level 3; more details can be found in the Preparation of the Q&#x0026;A Dataset section), questions were categorized into 9 (3 &#x00D7; 3) dimensions. These questions were then tested using 3 scales of LLMs and agents (using retrieval-augmented generation [RAG] and a search engine tool; more details can be found in the Use of LLMs and Agents section) on real-world problems. The responses were evaluated using multiple assessment metrics (more details can be found in the Model Evaluation section), and the performance of the models was analyzed across various aspects, including model size, methodology, task scenarios, and problem difficulty. To ensure the safety of the generated answers, we assessed all responses for safety, mitigating potential risks associated with the experimental outcomes. 
Finally, to enhance the robustness of the model, we conducted multiple tests on the same set of questions to confirm the stability of the results.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study framework for the application of large language models (LLMs) in chronic gastritis management. ARC: Alpha Readability Chinese; NLP: natural language processing; RAG: retrieval-augmented generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig01.png"/></fig></sec><sec id="s2-2"><title>Preparation of the Q&#x0026;A Dataset</title><p>Q&#x0026;A design was one of the most critical aspects of this study. The real-world questions selected must be representative. We classified the difficulty levels of the patient questions based on the cutting-edge methods proposed by Microsoft [<xref ref-type="bibr" rid="ref26">26</xref>]. Microsoft&#x2019;s hierarchical approach is designed to identify pathways for finding answers and is tailored for general domains, which does not fully align with the classification of problem difficulty in medical scenarios. Building on this, we collaborated with clinical experts to define 3 difficulty levels appropriate for medical scenarios: low-difficulty questions (level 1; based on explicit facts), medium-difficulty questions (level 2; based on implicit facts), and high-difficulty questions (level 3; requiring reasonable inference). <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the rules for categorizing question difficulty.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Criteria for question difficulty classification. (A) Low-difficulty questions, which can be answered based on explicit facts, primarily require the model to locate and extract relevant information. 
(B) Medium-difficulty questions, which require implicit facts to derive the answer, may involve information scattered across multiple segments. (C) High-difficulty questions, where no explicit answer exists in any information segment, demand complex reasoning to uncover latent information. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig02.png"/></fig></sec><sec id="s2-3"><title>Use of LLMs and Agents</title><p>As the models in the Qwen series have been proven to have excellent application performance [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], we selected the &#x201C;instruct&#x201D; models from the Qwen2 series with model sizes of 1.5B, 7B, and 72B as the foundational models (Qwen-1.5B, Qwen-7B, and Qwen-72B models) [<xref ref-type="bibr" rid="ref29">29</xref>]. In 2020, Lewis et al [<xref ref-type="bibr" rid="ref30">30</xref>] formally proposed and named the RAG framework, marking the birth of a new paradigm. Research has shown that, with the support of external knowledge, &#x201C;instruct&#x201D; models can generate more accurate and higher-quality responses [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Using domain-specific knowledge and external tools, we applied instruction prompting to these models to improve their specialization in the management of chronic gastritis. In addition, an agent was constructed based on the LLM framework, enabling the models to provide more professional responses to questions from patients with chronic gastritis. To enhance the capabilities of the agent, we used RAG and integrated search engine tools to prepare contextual information. 
The detailed process of RAG and the visualized integration of its components, as well as the prompt templates, are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Our preparatory work can be divided into the following components:</p><list list-type="bullet"><list-item><p>Knowledge base: in light of the updates in clinical techniques, we conducted a search in the <italic>National Medical Journal of China</italic> for Chinese expert consensuses and clinical guidelines published from 2020 to 2024 using &#x201C;gastritis&#x201D; as the keyword. Subsequently, we invited experts to manually screen and select 18 articles related to chronic gastritis. Using the zh_core_web_sm model, we segmented the long texts into sentences and further divided these sentences into chunks using a sliding window approach. Each chunk contained 20 sentences, with an overlap of 5 sentences between adjacent chunks to maintain contextual coherence. Subsequently, the all-MiniLM-L6-v2 model was used to map these chunks into a vector space, converting them into embedding vectors. Finally, using the Euclidean distance as the similarity metric, we created a Faiss index and added the embedding vectors to this index to enable rapid retrieval of the most similar vectors for a given query text.</p></list-item><list-item><p>Agent tools: a defining feature of an agent, as opposed to a stand-alone LLM, is its ability to flexibly use external tools. In this process, we invoked the Google Search application programming interface to retrieve URLs and content summaries related to the input query. The summary texts were then ranked based on relevance to produce the final search results.</p></list-item><list-item><p>Agent (with RAG and search tools): on the basis of the Faiss index and L2 distance, we retrieved the 5 most relevant text fragments from the knowledge base. 
The Google Search application programming interface sorted the organic results according to its internal ranking algorithm, and the 5 most relevant web page snippets were directly output. These 10 text fragments were then passed to the model as reference materials, whereas the decision of whether to use this reference content was autonomously made by the agents. Three agents were constructed based on the Qwen-1.5B, Qwen-7B, and Qwen-72B models. These agents shared identical network parameters, which were configured as follows: the maximum number of tokens was set to 800, the temperature was set to 0.4, the nucleus sampling parameter was set to 0.8, the truncation parameter was set to 5, and the frequency penalty was set to 0.5.</p></list-item></list></sec><sec id="s2-4"><title>Model Evaluation</title><p>Relying on a single perspective for evaluation metrics may lead to biased results. To address this, we used 3 types of evaluation methods: natural language processing&#x2013;based automatic evaluation metrics, manual scoring, and Chinese lexical evaluation metrics. The automatic evaluation metrics compared the system-generated answers with the standard answers (more details are provided in the Preparation of the Q&#x0026;A Dataset section) to derive evaluation results. Among these, the embedding average score and BERTScore use cosine similarity between vectors to calculate the relevance to the standard answers, which has proven to be an effective approach [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>]. 
In light of the unique nature of medical scenarios, we also designed a Likert scale and organized a panel comprising 5 clinical experts in relevant fields (including 1 certified clinical nutritionist), all of whom have over 10 years of work experience, to manually score the model-generated answers based on the following criteria [<xref ref-type="bibr" rid="ref35">35</xref>]:</p><list list-type="bullet"><list-item><p>Accuracy: the response contained specific and precise information rather than general or generic information.</p></list-item><list-item><p>Completeness: the response covered all relevant medical information and details, with no omission of critical content.</p></list-item><list-item><p>Clarity: the response was clearly expressed, easy to understand, readable, and free of ambiguity.</p></list-item><list-item><p>Relevance: the response was closely related to the question, providing useful information and recommendations that aligned closely with the patient&#x2019;s needs.</p></list-item><list-item><p>Continuity: the response maintained coherent and consistent phrasing throughout, with no logical jumps or inconsistencies.</p></list-item></list><p>The experts scored the responses generated by each model using each method (3 &#x00D7; 2 &#x00D7; 63) on a scale ranging from 1 to 5, where a higher score indicated better performance. In addition, we used a specialized evaluation framework designed for Chinese text to assess the richness and clarity of syntax and vocabulary [<xref ref-type="bibr" rid="ref36">36</xref>]. This framework was originally developed for general Chinese contexts, but its evaluation metrics can be adapted and interpreted more granularly for medical scenarios:</p><list list-type="bullet"><list-item><p>Lexical richness: the entropy values of all the words were calculated. The higher the entropy value, the greater the uncertainty associated with the words used in the text. 
This indicates a more diverse vocabulary, which in turn increases the reading difficulty level of the text.</p></list-item><list-item><p>Syntactic richness: the entropy values of all the dependencies in the text were calculated. The higher the entropy value, the greater the uncertainty in the text&#x2019;s dependency relationships or syntactic structure. This suggests a more complex and varied syntax, which in turn makes the text more difficult to read.</p></list-item><list-item><p>Semantic clarity: the semantic clarity value was calculated based on the skewness of the topic distribution probability extracted through latent Dirichlet allocation topic modeling [<xref ref-type="bibr" rid="ref37">37</xref>]. The higher this value, the more concentrated the text&#x2019;s topics represented by nouns, resulting in clearer semantics.</p></list-item><list-item><p>Semantic noise: the semantic noise value was calculated through the kurtosis of the topic distribution probability extracted through latent Dirichlet allocation topic modeling [<xref ref-type="bibr" rid="ref37">37</xref>]. The higher this value, the more the text&#x2019;s topics represented by nouns are skewed toward unimportant topics, thereby increasing the semantic noise.</p></list-item><list-item><p>Semantic richness: on the basis of the research by Lee et al [<xref ref-type="bibr" rid="ref37">37</xref>], the semantic richness of the text was calculated by summing the probabilities of the occurrence of nouns in the text. The higher the value, the richer the topics of the text, and the lower its readability.</p></list-item></list></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study aimed solely to evaluate the quality of responses generated by an AI agent for chronic gastritis queries and was therefore classified as nonhuman-participant research; institutional review board approval and full ethics review were waived. 
All patient questions were gathered during routine clinical operations, and no identifiable personal information was collected. Consequently, informed consent was not required and no compensation was provided. All analyses were performed on a secure server to safeguard the privacy and confidentiality of the questions submitted by patients.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Influence of Model Scales and Methods on Response Content</title><sec id="s3-1-1"><title>Model Scale Comparison</title><p>The number of parameters is one of the primary factors influencing the performance of LLMs [<xref ref-type="bibr" rid="ref38">38</xref>]. The results shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, <xref ref-type="table" rid="table1">Table 1</xref>, and Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> demonstrate that changes in model scales have a positive impact on the response content of LLMs. However, as the number of parameters increased, the improvement effect gradually diminished. Specifically, there was no statistically significant difference in response content between base models with 7B and 72B parameters. This phenomenon may be attributed to 2 reasons. First, general-purpose LLMs are suitable for answering questions in broad domains, but their actual performance in specialized fields such as medicine remains uncertain [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Second, the 7B model has already reached a bottleneck in understanding medical questions, and as our questions were all within the specialized medical domain, merely increasing the number of parameters did not significantly enhance the quality of the response content. Therefore, we further tested the relationship between model scale and response content using an agent. 
The results still indicated that the Qwen-72B model was the optimal choice for a health management Q&#x0026;A model focused on chronic gastritis.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Statistical differences in evaluation indicators under different model parameters. Median differences were compared using bar charts. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig03.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Mann-Whitney <italic>U</italic> test results for evaluation indicators across different model parameters.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Indicator</td><td align="left" valign="bottom"><italic>U</italic> value</td><td align="left" valign="bottom"><italic>Z</italic> value</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Embedding average score</td></tr><tr><td align="left" valign="top">&#x2003;LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> 1.5B vs 7B</td><td align="left" valign="top">1376</td><td align="left" valign="top">&#x2212;2.969</td><td align="left" valign="top">.003</td></tr><tr><td align="left" valign="top">&#x2003;LLM 1.5B vs 72B</td><td align="left" valign="top">1336</td><td align="left" valign="top">&#x2212;3.164</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top">&#x2003;LLM 7B vs 72B</td><td align="left" valign="top">1840</td><td align="left" valign="top">&#x2212;0.705</td><td align="left" valign="top">.48</td></tr><tr><td align="left" valign="top">&#x2003;Agent 1.5B vs 7B</td><td align="left" valign="top">1165</td><td align="left" valign="top">&#x2212;3.999</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Agent 1.5B 
vs 72B</td><td align="left" valign="top">873</td><td align="left" valign="top">&#x2212;5.423</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Agent 7B vs 72B</td><td align="left" valign="top">1534</td><td align="left" valign="top">&#x2212;2.198</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top" colspan="4">BERTScore</td></tr><tr><td align="left" valign="top">&#x2003;LLM 1.5B vs 7B</td><td align="left" valign="top">1203</td><td align="left" valign="top">&#x2212;3.813</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;LLM 1.5B vs 72B</td><td align="left" valign="top">1152</td><td align="left" valign="top">&#x2212;4.062</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;LLM 7B vs 72B</td><td align="left" valign="top">1781</td><td align="left" valign="top">&#x2212;0.993</td><td align="left" valign="top">.32</td></tr><tr><td align="left" valign="top">&#x2003;Agent 1.5B vs 7B</td><td align="left" valign="top">730</td><td align="left" valign="top">&#x2212;6.121</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Agent 1.5B vs 72B</td><td align="left" valign="top">524</td><td align="left" valign="top">&#x2212;7.126</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Agent 7B vs 72B</td><td align="left" valign="top">1484</td><td align="left" valign="top">&#x2212;2.442</td><td align="left" valign="top">.01</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-1-2"><title>Comparison of LLMs and Agents</title><p>Using the Qwen-72B model as the base model, we compared the response content of the LLM and the agent. 
The trend chart (<xref ref-type="fig" rid="figure4">Figure 4</xref>) shows that the response quality of the agent had an upward trend compared with that of the LLM. The linguistic evaluation results (<xref ref-type="table" rid="table2">Table 2</xref>) show that the responses generated by the agent had higher values in lexical richness, syntactic richness, semantic noise, and semantic richness and a lower value in semantic clarity. These Chinese evaluation indicators are comparative metrics that do not focus on individual results and do not have a defined range of values. This indicates that the responses generated by the agent were more complex in terms of vocabulary and syntax, had a higher reading difficulty level, and covered a broader range of topics compared to the responses generated by the LLM. This demonstrates that the agent not only outperformed the LLM in terms of alignment with the standard answers but also exceeded the LLM in terms of breadth of thinking and domain-specific expertise. This is one concrete example demonstrating the effectiveness of using the tool (Google search engine).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Method comparison: large language model (LLM) and agent. The blue line represents the median trend line, whereas the green and red lines represent the quartile trend lines.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig04.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of a large language model (LLM) and agent in Chinese-language dimensions.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">LLM 72B</td><td align="left" valign="bottom">Agent 72B</td></tr></thead><tbody><tr><td align="left" valign="top">Lexical richness</td><td align="char" char="." 
valign="top">4.0707</td><td align="char" char="." valign="top">4.2950</td></tr><tr><td align="left" valign="top">Syntactic richness</td><td align="char" char="." valign="top">2.1529</td><td align="char" char="." valign="top">2.1911</td></tr><tr><td align="left" valign="top">Semantic clarity</td><td align="char" char="." valign="top">0.0707</td><td align="char" char="." valign="top">0.0643</td></tr><tr><td align="left" valign="top">Semantic noise</td><td align="char" char="." valign="top">5.8923</td><td align="char" char="." valign="top">6.3081</td></tr><tr><td align="left" valign="top">Semantic richness</td><td align="char" char="." valign="top">0.2411</td><td align="char" char="." valign="top">0.2520</td></tr></tbody></table></table-wrap></sec></sec><sec id="s3-2"><title>Influence of Question Difficulty and Medical Contexts on Answer Quality</title><sec id="s3-2-1"><title>Comparison of Response Content Across Different Difficulty Levels</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> shows the results stratified by difficulty level, which align with the previous conclusions: larger models outperformed smaller models, and the agents surpassed the LLMs. As question difficulty increased, the similarity between response content and reference answers showed a declining trend. Mann-Whitney <italic>U</italic> tests on the Embedding Average Scores of the 72B-Agent across difficulty levels revealed that the observed decreases from level 1 to level 2 and from level 2 to level 3 were statistically significant (<italic>U</italic> value=141.000, <italic>Z</italic> value=&#x2013;2.000, <italic>P</italic>=.046; <italic>U</italic> value=121.000, <italic>Z</italic> value=&#x2013;2.503, <italic>P</italic>=.01). 
Smaller models exhibited instability, which may be attributed to the limitations of LLMs in specialized domains, insufficient contextual comprehension by smaller models, and the negative impact of irrelevant information [<xref ref-type="bibr" rid="ref41">41</xref>]. When domain-specific knowledge obtained through RAG and search engine tools was input into smaller agents, these models struggled to accurately understand the relevant information, resulting in noisier and less reliable outputs.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>The trend in the impact of question complexity on answer quality. The points on the line represent the median of the embedding average score for all questions under that category. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig05.png"/></fig></sec><sec id="s3-2-2"><title>Comparison of Response Content From the Qwen-72B Model Across Different Health Management Tasks</title><p>The manually assigned scores for the Qwen-72B model were aggregated and analyzed from the perspective of health management tasks (<xref ref-type="fig" rid="figure6">Figure 6</xref>). On the basis of the evaluation results of multiple experts, the clarity and coherence of the agent&#x2019;s responses were slightly inferior to those of the LLM, which was consistent with the previous findings. This is attributed to the agent&#x2019;s replies being more specialized, covering a broader range of topics, and having a higher reading difficulty level. Overall, across the 3 health management task scenarios, the agent demonstrated superior application performance compared to the LLM, with higher accuracy and completeness in its responses. Correspondingly, the increased reading difficulty level placed greater demands on patients&#x2019; comprehension abilities. 
In medication guidance and clinical consultation tasks, the aggregate accuracy score of the agent exceeded that for lifestyle intervention tasks, indicating that the agent could more precisely understand questions with relatively fixed answers and generate targeted responses. However, the completeness of the agent&#x2019;s responses for medication guidance tasks was slightly inferior to that for lifestyle intervention and clinical consultation tasks. This suggests that the knowledge base and search engine tools provided to the agent need to be further enhanced in terms of drug-related knowledge to improve its capability in medication guidance.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Comparison of different health management tasks. On the basis of the statistical results of human evaluation, the mean score assigned by all experts to all questions under a specific category of health management tasks was calculated. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73857_fig06.png"/></fig></sec></sec><sec id="s3-3"><title>Comprehensive Evaluation of Q&#x0026;A Models</title><p>The use of AI tools in medical scenarios carries unique considerations as medical assistance tools must meet stringent quality assurance standards. <xref ref-type="fig" rid="figure6">Figure 6</xref> and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> show the expert ratings for the dimensions of health management and model characteristics, respectively. As the parameter size of the foundational model increased, expert ratings also gradually increased. However, agents based on smaller models, due to their limited ability to comprehend complex contexts, exhibited poorer performance compared to LLMs of equivalent size. 
When the size of the foundational model increased, the superiority of agents over LLMs became significantly more pronounced, whereas the increased reading difficulty level resulting from the enhanced professionalism of the responses also became evident. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> shows the results of the ablation experiment, indicating that the improvements in the agents stemmed from the synergistic effects of RAG and search tools.</p><p>A comparative analysis was conducted between the model with the strongest comprehensive capabilities (agent 72B) and 3 newly released LLMs in 2025: 2 medical LLMs, Baichuan-14B-M1 [<xref ref-type="bibr" rid="ref42">42</xref>] and MedGemma-27B [<xref ref-type="bibr" rid="ref43">43</xref>], and the 32B model from the Qwen3 series [<xref ref-type="bibr" rid="ref44">44</xref>]. This study revealed that Baichuan-14B-M1 outperformed larger-scale models such as Qwen2.5-72B and Llama 3-70B in the medical domain [<xref ref-type="bibr" rid="ref42">42</xref>]. Meanwhile, MedGemma-27B surpassed models such as BioMistral-7B-DARE and OpenBioLLM-70B [<xref ref-type="bibr" rid="ref43">43</xref>]. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, the BERTScore and embedding IQRs computed over 63 questions indicate that the agent 72B achieved the highest performance, surpassing both the latest general-domain LLMs and medical-domain LLMs on chronic gastritis health management tasks.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of state-of-the-art models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">BERTScore, median (IQR)</td><td align="left" valign="bottom">Embedding average score, median (IQR)</td></tr></thead><tbody><tr><td align="left" valign="top">Baichuan-14B-M1</td><td align="char" char="." 
valign="top">43.142 (37.151-48.233)</td><td align="char" char="." valign="top">86.024 (80.914-89.266)</td></tr><tr><td align="left" valign="top">MedGemma-27B</td><td align="char" char="." valign="top">40.448 (35.130-47.228)</td><td align="char" char="." valign="top">86.887 (82.274-89.869)</td></tr><tr><td align="left" valign="top">Qwen3-32B</td><td align="char" char="." valign="top">40.650 (33.372-47.909)</td><td align="char" char="." valign="top">85.626 (81.589-89.156)</td></tr><tr><td align="left" valign="top">Agent 72B</td><td align="char" char="." valign="top">44.884 (38.934-52.909)</td><td align="char" char="." valign="top">88.574 (84.211-91.298)</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Safety Analysis</title><p>In a survey of 11,004 adults, 6602 (60.0%) reported feeling uneasy about health care professionals using AI-assisted tools, underscoring the critical need to establish patient trust [<xref ref-type="bibr" rid="ref45">45</xref>]. Ensuring safety is key to achieving this goal. Using the Likert scale mentioned previously, experts evaluated the safety (harmful outputs such as the fabrication of false information, dissemination of erroneous data, presence of biases, associated risks, alterations, and plagiarism) of the models&#x2019; outputs [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>]. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, base models with a size larger than 7B demonstrated sufficient capability to generate relatively safe responses for patients.
<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents a representative error case generated by agent 7B accompanied by expert commentary.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Safety assessment results.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Safety score (1-5), mean (SD; 95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> 1.5B</td><td align="left" valign="top">4.85 (0.62; 4.79-4.92)</td></tr><tr><td align="left" valign="top">Agent 1.5B</td><td align="left" valign="top">4.42 (1.06; 4.31-4.54)</td></tr><tr><td align="left" valign="top">LLM 7B</td><td align="left" valign="top">4.98 (0.13; 4.97-5.00)</td></tr><tr><td align="left" valign="top">Agent 7B</td><td align="left" valign="top">4.95 (0.27; 4.92-4.98)</td></tr><tr><td align="left" valign="top">LLM 72B</td><td align="left" valign="top">4.95 (0.21; 4.93-4.98)</td></tr><tr><td align="left" valign="top">Agent 72B</td><td align="left" valign="top">4.98 (0.15; 4.96-4.99)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Robustness Analysis</title><p>The stability of the responses generated by the Q&#x0026;A model was also a critical factor in determining its suitability for clinical applications. To further analyze the robustness of the agent, we preserved the original parameters and conducted supplementary evaluations based on the preconfigured agent. We randomly selected 1 patient question from each difficulty level and each type of health management task, resulting in a total of 9 questions. Each of these 9 questions was input into the agent 30 times, yielding 270 (9 &#x00D7; 30) outputs.
For each output, we calculated the embedding average score, BERTScore, and the corresponding mean absolute deviation (MAD). The results showed that the MAD for the embedding average score was 0.0167 and the MAD for the BERTScore was 0.0387. These findings indicate that the agent based on the Qwen-72B model performed with high stability, exhibiting minimal random fluctuations and demonstrating strong robustness.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The evidence from this study demonstrates that LLM-based agents possess considerable potential in the management of chronic gastritis. The responses generated by agents were generally superior to those produced by LLMs, effectively addressing the limitations of LLMs in handling high-complexity questions. Moreover, agents exhibited higher safety and stability and were capable of outperforming LLMs in the cutting-edge medical domain.</p><p>This study presents a comprehensive evaluation of LLM-based agents&#x2019; effectiveness in chronic gastritis management across heterogeneous clinical scenarios and different scales. Our multimetric analysis revealed 4 critical findings that advance the understanding of AI-driven medical decision support systems. The first finding relates to model scaling effects. The quality of responses for chronic gastritis exhibited progressive enhancement with increasing base model sizes. Larger architectures (eg, Qwen-72B) demonstrated superior medical information processing capabilities through improved semantic comprehension and clinical reasoning. Conversely, smaller models showed inherent limitations in effectively integrating medical knowledge derived from RAG and search tools, resulting in suboptimal domain-specific performance. This parameter-performance correlation aligns with neural scaling laws while highlighting critical capacity thresholds for medical AI applications. 
The second finding relates to agent versus base LLM performance. When using base models with larger parameters (eg, Qwen-72B), the agents demonstrated better performance compared to the LLMs, particularly in terms of answer accuracy and completeness. However, due to the agents&#x2019; stronger medical expertise in generating responses, as well as their higher lexical and syntactic complexity, the readability of their answers tended to be lower than that of the LLMs&#x2019; answers. The third finding relates to the impact of problem complexity. As the difficulty of the questions increased, the response quality of both Q&#x0026;A models showed a declining trend. However, overall, the agents outperformed the LLMs across all 3 difficulty levels, particularly in handling high-difficulty questions, where the agents significantly compensated for the LLMs&#x2019; shortcomings in the medical domain. The fourth finding relates to performance on different tasks. When addressing different health management task scenarios, the agents demonstrated statistically significant superiority over the LLMs in both accuracy and completeness of responses, with the most notable improvement observed in clinical consultation scenarios. Lifestyle intervention questions were subjective and did not have a single standard answer, resulting in limited improvement from RAG and search tools. Drug information is complex and varied, as medications produced by different manufacturers may have differences in use details, and the current information is insufficient to fully cover all potential patient inquiries. Despite these challenges, the agents still exhibited higher practical value in chronic gastritis management compared to the LLMs.</p><p>This study has both theoretical and practical contributions. On the theoretical side, first, this study demonstrated that LLM-based agents outperform generalized LLMs in multiple scenarios of chronic gastritis management. 
Second, this study demonstrated that the larger the parameters of either the base model or the LLM-based agents, the better the performance of the model, even though the quality of the answers tended to decrease as the difficulty of the questions increased. Finally, this study demonstrated that the LLM-based agents had better performance in multiple scenarios of chronic gastritis management. On the practical side, this study identified the value of LLM-based agents in chronic disease management. By using real-world problems and multiscenario chronic disease management tasks, we validated the capabilities of these agents. This provides a reliable LLM-driven approach to the management of chronic gastritis, paves the way for future LLM-based chronic disease management, and provides a more flexible form of counseling for patients.</p><p>There are some limitations to our study. First, we only used textual data, whereas data in the clinical setting also include modalities such as temporal, image, and video data. Future research could use multimodal data to evaluate the performance of LLMs in chronic disease management. Second, due to the privacy management requirements of health care data, we only used privately deployed models, which generally have a small number of parameters. Future research can further explore models with a larger number of parameters, such as the GPT-4 family of models. Third, we did not use medical large language models such as HuatuoGPT [<xref ref-type="bibr" rid="ref49">49</xref>]. This is because these models have been fine-tuned using the medical corpus, and the unknown corpus may affect our evaluation and cause cognitive bias in specific scenarios. Fourth, the hallucination issue in LLMs cannot be overlooked. While agents can reduce the probability of hallucinations, to prevent potential adverse impacts on health care applications, we recommend incorporating a hallucination threshold control mechanism in future studies.
This system would automatically suspend operations and initiate retraining when the false positive rate exceeds predefined safety thresholds. It should be noted that these findings are based solely on Chinese-language data. Future research could validate these results across different linguistic and cultural contexts. In addition, patient surveys and randomized controlled trials could be conducted to investigate factors influencing patients&#x2019; use of AI tools in real clinical settings. Such studies would provide further guidance for enhancing the effectiveness of chronic gastritis self-management tools.</p></sec><sec id="s4-2"><title>Conclusions</title><p>In this study, we compared the effectiveness of agents and LLMs on a chronic disease Q&#x0026;A dataset across different levels of difficulty and various scenarios. Our multiperspective evaluation results show that the responses generated by the agents were often preferred over those of the LLMs due to their higher embedding average score, BERTScore, accuracy, and completeness values, as well as their higher values in other metrics. The LLM-based agents demonstrated advantages across different difficulty levels, particularly addressing the shortcomings of LLMs in handling high-difficulty questions. Furthermore, the LLM-based agents exhibited varying application effectiveness in different health management task scenarios, proving more suitable for questions with relatively fixed answers. Compared with state-of-the-art general-purpose models and medical-domain large models, the 72B agent further demonstrates its professional competence in the health management of chronic gastritis. The robustness and safety analyses we conducted explored the stability of the agents&#x2019; responses and their safety for clinical application. 
The results of this study suggest that LLM-based agents have high value for application in the management of chronic gastritis and that they are effective in guiding patients with chronic diseases in solving common problems, thereby potentially reducing clinicians&#x2019; workload and improving the quality of patients&#x2019; home care.</p></sec></sec></body><back><ack><p>This study was funded by the China Computer Federation (CCF)-BaiChuan-Ebtech Foundation Model Fund (2023012) and Chinese Information Processing Society of China (CIPSC)-Social Media Processing (SMP)-Zhipu Large Model Cross-Disciplinary Fund (ZPCG20241107362).</p></ack><notes><sec><title>Data Availability</title><p>The core code and data for this research can be found on the corresponding author&#x2019;s GitHub page [<ext-link ext-link-type="uri" xlink:href="https://github.com/472404420/JMI-codes">50</ext-link>].</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MAD</term><def><p>mean absolute deviation</p></def></def-item><def-item><term id="abb4">Q&#x0026;A</term><def><p>question and answer</p></def></def-item><def-item><term id="abb5">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Correa</surname><given-names>P</given-names> </name></person-group><article-title>Human gastric carcinogenesis: a multistep and multifactorial process--first American cancer society award lecture on cancer epidemiology and prevention</article-title><source>Cancer 
Res</source><year>1992</year><month>12</month><day>15</day><volume>52</volume><issue>24</issue><fpage>6735</fpage><lpage>6740</lpage><pub-id pub-id-type="medline">1458460</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ferlay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Global cancer statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2021</year><month>05</month><volume>71</volume><issue>3</issue><fpage>209</fpage><lpage>249</lpage><pub-id pub-id-type="doi">10.3322/caac.21660</pub-id><pub-id pub-id-type="medline">33538338</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Volatile organic metabolites identify patients with gastric carcinoma, gastric ulcer, or gastritis and control patients</article-title><source>Cancer Cell Int</source><year>2017</year><volume>17</volume><issue>1</issue><fpage>108</fpage><pub-id pub-id-type="doi">10.1186/s12935-017-0475-x</pub-id><pub-id pub-id-type="medline">29200968</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Health related quality of life in patients with chronic gastritis and peptic ulcer and factors with impact: a longitudinal study</article-title><source>BMC Gastroenterol</source><year>2014</year><month>08</month><day>20</day><volume>14</volume><issue>1</issue><fpage>149</fpage><pub-id pub-id-type="doi">10.1186/1471-230X-14-149</pub-id><pub-id pub-id-type="medline">25141760</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The status and risk factors for anxiety/depression in patients with atrophic chronic gastritis: a cross-sectional study</article-title><source>Ann Palliat Med</source><year>2022</year><month>10</month><volume>11</volume><issue>10</issue><fpage>3147</fpage><lpage>3159</lpage><pub-id pub-id-type="doi">10.21037/apm-22-730</pub-id><pub-id pub-id-type="medline">36096741</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Osadolor</surname><given-names>OO</given-names> </name><name name-style="western"><surname>Osadolor</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Osadolor</surname><given-names>OO</given-names> </name><name name-style="western"><surname>Enabulele</surname><given-names>E</given-names> </name><name name-style="western"><surname>Akaji</surname><given-names>EA</given-names> </name><name 
name-style="western"><surname>Odiowaya</surname><given-names>DE</given-names> </name></person-group><article-title>Access to health services and health inequalities in remote and rural areas</article-title><source>Janaki Med Coll J Med Sci</source><year>2022</year><month>08</month><day>28</day><volume>10</volume><issue>2</issue><fpage>70</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.3126/jmcjms.v10i2.47868</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>D</given-names> </name><name name-style="western"><surname>He</surname><given-names>S</given-names> </name><name name-style="western"><surname>Webster</surname><given-names>C</given-names> </name></person-group><article-title>Examining trans-provincial diagnosis of rare diseases in China: the importance of healthcare resource distribution and patient mobility</article-title><source>Sustainability</source><year>2020</year><month>01</month><volume>12</volume><issue>13</issue><fpage>5444</fpage><pub-id pub-id-type="doi">10.3390/su12135444</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>White</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Cyberchondria: studies of the escalation of medical concerns in web search</article-title><source>ACM Trans Inf Syst</source><year>2009</year><month>11</month><day>30</day><volume>27</volume><issue>4</issue><fpage>1</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.1145/1629096.1629101</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Evans</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Benjamin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Delano</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bates</surname><given-names>DW</given-names> </name></person-group><article-title>Patients&#x2019; attitudes toward electronic health information exchange: qualitative study</article-title><source>J Med Internet Res</source><year>2009</year><month>08</month><day>6</day><volume>11</volume><issue>3</issue><fpage>e30</fpage><pub-id pub-id-type="doi">10.2196/jmir.1164</pub-id><pub-id pub-id-type="medline">19674960</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdelhamid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gaia</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sanders</surname><given-names>GL</given-names> </name></person-group><article-title>Putting the focus back on the patient: how privacy concerns affect personal health information sharing intentions</article-title><source>J Med Internet Res</source><year>2017</year><month>09</month><day>13</day><volume>19</volume><issue>9</issue><fpage>e169</fpage><pub-id pub-id-type="doi">10.2196/jmir.6877</pub-id><pub-id pub-id-type="medline">28903895</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><year>2018</year><access-date>2025-10-09</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sridharan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sivaramakrishnan</surname><given-names>G</given-names> </name></person-group><article-title>Investigating the capabilities of advanced large language models in generating patient instructions and patient educational material</article-title><source>Eur J Hosp Pharm</source><year>2025</year><month>10</month><day>24</day><volume>32</volume><issue>6</issue><fpage>501</fpage><lpage>507</lpage><pub-id pub-id-type="doi">10.1136/ejhpharm-2024-004245</pub-id><pub-id pub-id-type="medline">39393839</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern 
Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wooldridge</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jennings</surname><given-names>NR</given-names> </name></person-group><article-title>Intelligent agents: theory and practice</article-title><source>Knowl Eng Rev</source><year>1995</year><month>06</month><volume>10</volume><issue>2</issue><fpage>115</fpage><lpage>152</lpage><pub-id pub-id-type="doi">10.1017/S0269888900008122</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gosmar</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dahl</surname><given-names>DA</given-names> </name></person-group><article-title>Hallucination mitigation using agentic AI natural language-based frameworks</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 19, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.13946</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>Z</given-names> </name></person-group><article-title>Controlling large language model hallucination based on agent AI with LangGraph</article-title><source>Cambridge Open Engage</source><comment>Preprint posted online on  Jan 13, 2025</comment><pub-id 
pub-id-type="doi">10.33774/coe-2025-xkwl5</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wallat</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jatowt</surname><given-names>A</given-names> </name><name name-style="western"><surname>Anand</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ang&#x00E9;lica</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lattanzi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Medina</surname><given-names>AM</given-names> </name></person-group><article-title>Temporal blind spots in large language models</article-title><source>WSDM &#x2019;24: Proceedings of the 17th ACM International Conference on Web Search and Data Mining</source><year>2024</year><publisher-name>Association for Computing Machinery</publisher-name><pub-id pub-id-type="doi">10.1145/3616855.3635818</pub-id><pub-id pub-id-type="other">9798400703713</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>JS</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Morris</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bernstein</surname><given-names>MS</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Follmer</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Han</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steimle</surname><given-names>J</given-names> </name><name name-style="western"><surname>Riche</surname><given-names>NH</given-names> </name></person-group><article-title>Generative agents: interactive simulacra of human behavior</article-title><source>UIST &#x2019;23: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology</source><year>2023</year><publisher-name>Association for Computing Machinery</publisher-name><pub-id pub-id-type="doi">10.1145/3586183.3606763</pub-id><pub-id pub-id-type="other">9798400701320</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>AM</given-names> </name><etal/></person-group><article-title>Training socially aligned language models on simulated social interactions</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.16960</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sumers</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Griffiths</surname><given-names>TL</given-names> </name></person-group><article-title>Cognitive architectures for language agents</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.02427</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>EHRAgent: code empowers large language models for few-shot complex tabular reasoning on electronic health records</article-title><conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 12-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.1245</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Agent-pro: learning to evolve via policy-level reflection and optimization</article-title><conf-name>Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.292</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Zu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>X</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Al-Onaizan</surname><given-names>Y</given-names></name><name name-style="western"><surname>Bansal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names></name></person-group><article-title>LONGAGENT: achieving question answering for 128k-token-long documents through multi-agent collaboration</article-title><source>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</source><year>2024</year><publisher-name>Association for Computational Linguistics</publisher-name><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.912</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bartolo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>A</given-names> </name><name name-style="western"><surname>Riedel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Stenetorp</surname><given-names>P</given-names> </name></person-group><article-title>Fantastically ordered prompts and where to find them: overcoming few-shot prompt order sensitivity</article-title><conf-name>Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>May 22-27, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.556</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Chu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YP</given-names> </name><name name-style="western"><surname>Nakayama</surname><given-names>H</given-names> </name></person-group><article-title>A better LLM evaluator for text generation: the impact of prompt output sequencing and optimization</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 14, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.09972</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>L</given-names> </name></person-group><article-title>Retrieval augmented generation (RAG) and beyond: a comprehensive survey on how to make your llms use external data more wisely</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.14924</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Papageorgiou</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chronis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Varlamis</surname><given-names>I</given-names> </name><name name-style="western"><surname>Himeur</surname><given-names>Y</given-names> 
</name></person-group><article-title>A survey on the use of large language models (LLMs) in fake news</article-title><source>Future Internet</source><year>2024</year><volume>16</volume><issue>8</issue><fpage>298</fpage><pub-id pub-id-type="doi">10.3390/fi16080298</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen2.5 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 3, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.15115</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Qwen2 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 10, 2024</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2407.10671</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><name name-style="western"><surname>Petroni</surname><given-names>F</given-names> </name><name name-style="western"><surname>Karpukhin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 12, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.11401</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ng</surname><given-names>KKY</given-names> </name><name name-style="western"><surname>Matsuba</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>PC</given-names> </name></person-group><article-title>RAG in health care: a novel framework for improving communication and decision-making by addressing LLM limitations</article-title><source>NEJM AI</source><year>2025</year><month>01</month><volume>2</volume><issue>1</issue><fpage>AIra2400380</fpage><pub-id pub-id-type="doi">10.1056/AIra2400380</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Agirre</surname><given-names>E</given-names> </name><name name-style="western"><surname>Alfonseca</surname><given-names>E</given-names> 
</name><name name-style="western"><surname>Hall</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kravalova</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pa&#x015F;ca</surname><given-names>M</given-names> </name><name name-style="western"><surname>Soroa</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ostendorf</surname><given-names>M</given-names> </name></person-group><article-title>A study on similarity and relatedness using distributional and wordnet-based approaches</article-title><source>NAACL &#x2019;09: Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics</source><year>2009</year><publisher-name>Association for Computational Linguistics</publisher-name><pub-id pub-id-type="doi">10.3115/1620754.1620758</pub-id><pub-id pub-id-type="other">9781932432411</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Landi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Nabeel</surname><given-names>I</given-names> </name></person-group><article-title>Using fine-tuned large language models to parse clinical notes in musculoskeletal pain disorders</article-title><source>Lancet Digit Health</source><year>2023</year><month>10</month><day>26</day><volume>5</volume><issue>12</issue><fpage>e855</fpage><lpage>e858</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00202-9</pub-id><pub-id pub-id-type="medline">39492289</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Veen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Adapted large language models can outperform medical experts in clinical text summarization</article-title><source>Nat Med</source><year>2024</year><month>04</month><volume>30</volume><issue>4</issue><fpage>1134</fpage><lpage>1142</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-02855-5</pub-id><pub-id pub-id-type="medline">38413730</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TYC</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lei</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>K</given-names> </name></person-group><article-title>AlphaReadabilityChinese: a tool for the measurement of readability 
in Chinese texts and its applications</article-title><source>Foreign Lang Teach</source><year>2024</year><volume>2024</volume><issue>1</issue><fpage>83</fpage><lpage>93</lpage><pub-id pub-id-type="doi">10.13458/j.cnki.flatt.004997</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>BW</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JJ</given-names> </name></person-group><article-title>Pushing on text readability assessment: a transformer meets handcrafted linguistic features</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 16, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.12258</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cascella</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montomoli</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Bellini</surname><given-names>V</given-names> </name><name name-style="western"><surname>Bignami</surname><given-names>E</given-names> </name></person-group><article-title>Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios</article-title><source>J Med Syst</source><year>2023</year><month>03</month><day>4</day><volume>47</volume><issue>1</issue><fpage>33</fpage><pub-id pub-id-type="doi">10.1007/s10916-023-01925-4</pub-id><pub-id pub-id-type="medline">36869927</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Misra</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Scales</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dohan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chi</surname><given-names>EH</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Krause</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brunskill</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>K</given-names> </name><name name-style="western"><surname>Engelhardt</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sabato</surname><given-names>S</given-names> </name></person-group><article-title>Large language models can be easily distracted by irrelevant context</article-title><source>ICML&#x2019;23: Proceedings of the 40th International Conference on Machine Learning</source><year>2023</year><access-date>2025-05-25</access-date><publisher-name>JMLR.org</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3618408.3619699">https://dl.acm.org/doi/10.5555/3618408.3619699</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Song</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Baichuan-M1: pushing the medical capability of large language 
models</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 5, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.12671</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sellergren</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kazemzadeh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jaroensri</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kiraly</surname><given-names>A</given-names> </name><name name-style="western"><surname>Traverse</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kohlberger</surname><given-names>T</given-names> </name><etal/></person-group><article-title>MedGemma technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 12, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2507.05201</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Tyson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pasquini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Spencer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Funk</surname><given-names>C</given-names> </name></person-group><article-title>60% of Americans would be uncomfortable with provider relying on AI in their own health care</article-title><source>Pew Research Center</source><year>2023</year><access-date>2025-07-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.pewresearch.org/science/2023/02/22/60-of-americans-would-be-uncomfortable-with-provider-relying-on-ai-in-their-own-health-care/">https://www.pewresearch.org/science/2023/02/22/60-of-americans-would-be-uncomfortable-with-provider-relying-on-ai-in-their-own-health-care/</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on evaluation of large language models</article-title><source>ACM Trans Intell Syst Technol</source><year>2024</year><month>06</month><day>30</day><volume>15</volume><issue>3</issue><fpage>1</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1145/3641289</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Dai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bian</surname><given-names>C</given-names> </name><etal/></person-group><article-title>BeaverTails: towards improved safety alignment of LLM via a human-preference dataset</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 7, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.04657</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>JCL</given-names> </name><etal/></person-group><article-title>A proposed s.c.o.r.e. 
evaluation framework for large language models &#x2013; safety, consensus &#x0026; context, objectivity, reproducibility and explainability</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 10, 2024</comment><pub-id pub-id-type="doi">10.2139/ssrn.5029562</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>F</given-names> </name><etal/></person-group><article-title>HuatuoGPT, towards taming language model to be a doctor</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2023</conf-name><conf-date>Dec 6-10, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.725</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="web"><article-title>JMI-codes</article-title><source>GitHub</source><access-date>2025-10-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/472404420/JMI-codes">https://github.com/472404420/JMI-codes</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary tables and figures.</p><media xlink:href="medinform_v13i1e73857_app1.docx" xlink:title="DOCX File, 517 KB"/></supplementary-material></app-group></back></article>