<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e66429</article-id><article-id pub-id-type="doi">10.2196/66429</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models as a Consulting Hotline for Patients With Breast Cancer and Specialists in China: Cross-Sectional Questionnaire Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Liu</surname><given-names>Hui</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Peng</surname><given-names>Jialun</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Lu</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Deng</surname><given-names>Ao</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>XiangXin</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yin</surname><given-names>Guobing</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Luo</surname><given-names>Haojun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Thyroid and Breast Surgery, The Second Affiliated Hospital of Chongqing Medical University</institution><addr-line>74 Linjiang Road</addr-line><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Hepatobiliary Surgery, The Second Affiliated Hospital of Chongqing Medical University</institution><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Thyroid and Breast Surgery, Renji Hospital Affiliated of Chongqing University</institution><addr-line>Chongqing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bhasuran</surname><given-names>Balu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Haojun Luo, PhD, Department of Thyroid and Breast Surgery, The Second Affiliated Hospital of Chongqing Medical University, 74 Linjiang Road, 
Chongqing, 400010, China, 86 13452999485; <email>303505@hospital.cqmu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>5</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e66429</elocation-id><history><date date-type="received"><day>12</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>21</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>24</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Hui Liu, Jialun Peng, Lu Li, Ao Deng, XiangXin Huang, Guobing Yin, Haojun Luo. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 27.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e66429"/><abstract><sec><title>Background</title><p>The disease burden of breast cancer is increasing in China. 
Guiding people to obtain accurate information on breast cancer and improving the public&#x2019;s health literacy are crucial for the early detection and timely treatment of breast cancer. Large language models (LLMs) are currently a popular source of health information. However, the accuracy and practicality of the breast cancer&#x2013;related information provided by LLMs have not yet been evaluated.</p></sec><sec><title>Objective</title><p>This study aims to evaluate and compare the accuracy, practicality, and generalization-specificity of responses to breast cancer&#x2013;related questions from two LLMs, ChatGPT and ERNIE Bot (EB).</p></sec><sec sec-type="methods"><title>Methods</title><p>The questions asked to the LLMs consisted of a patient questionnaire and an expert questionnaire, each containing 15 questions. ChatGPT was queried in both Chinese and English, recorded as ChatGPT-Chinese (ChatGPT-C) and ChatGPT-English (ChatGPT-E) respectively, while EB was queried in Chinese. The accuracy, practicality, and generalization-specificity of each inquiry&#x2019;s responses were rated by a breast cancer multidisciplinary treatment team using Likert scales.</p></sec><sec sec-type="results"><title>Results</title><p>Overall, for both the patient and expert questionnaire, the accuracy and practicality of responses from ChatGPT-E were significantly higher than those from ChatGPT-C and EB (all <italic>Ps</italic>&#x003C;.001). However, the responses from all LLMs are relatively generalized, leading to lower accuracy and practicality for the expert questionnaire compared to the patient questionnaire. 
Additionally, there were issues such as the lack of supporting evidence and potential ethical risks in the responses of LLMs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Currently, compared to other LLMs, ChatGPT-E has demonstrated greater potential for application in educating Chinese patients with breast cancer, and may serve as an effective tool for them to obtain health information. However, for breast cancer specialists, these LLMs are not yet suitable for assisting in clinical diagnosis or treatment activities. Additionally, data security, ethical, and legal risks associated with using LLMs in clinical practice cannot be ignored. In the future, further research is needed to determine the true efficacy of LLMs in clinical scenarios related to breast cancer in China.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>breast cancer</kwd><kwd>health education</kwd><kwd>cross-sectional study</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Breast cancer has become the most common malignant tumor globally, with an estimated 11.7% of all new cancer cases in 2020 [<xref ref-type="bibr" rid="ref1">1</xref>]. The incidence of breast cancer has been rising in China, with 420,000 Chinese women diagnosed in 2020, accounting for 18% of global cases [<xref ref-type="bibr" rid="ref2">2</xref>]. Breast cancer also contributes significantly to cancer-related deaths; however, early detection and timely treatment play a significant role in reducing its mortality rate [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Providing health education through appropriate channels and disseminating accurate medical health information to the public can help improve public awareness of breast cancer, thereby alleviating the burden of breast cancer in China. Currently, the internet is the primary source for people to obtain health information. 
Recent studies showed that 55% of Europeans aged 16&#x2010;74 years seek health-related information online, while in mainland China, nearly 79% of the population searches for health information on the internet [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. However, the quality of online health information varies considerably, and inaccurate or even erroneous health information may lead to patients making inappropriate medical decisions, posing a threat to public health [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. A large language model (LLM) is a type of chatbot that combines artificial intelligence with natural language processing and is trained on massive text data [<xref ref-type="bibr" rid="ref11">11</xref>]. ChatGPT, developed by OpenAI, has garnered global attention since its release and has been applied across multiple fields. ERNIE Bot (EB; Chinese name: Wenxin-Yiyan), developed by Baidu, benefits from Baidu&#x2019;s strong influence in artificial intelligence and has achieved significant popularity and a user base in the Chinese market. These tools have recently become widely popular and demonstrated significant potential in the medical field [<xref ref-type="bibr" rid="ref12">12</xref>]. Studies have shown that ChatGPT has greater potential for patient education in breast reconstruction and diabetes self-management, while also being able to accurately answer some cancer-related questions [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. However, some scholars have questioned the accuracy and practicality of the medical health information provided by ChatGPT [<xref ref-type="bibr" rid="ref12">12</xref>]. 
Currently, there is a lack of studies evaluating the educational potential of ChatGPT and EB&#x2014;two of the most commonly used LLMs in China&#x2014;among Chinese patients with breast cancer and their utility for Chinese breast cancer physicians. To address this gap, this study assesses whether these LLMs can serve as educational tools for Chinese patients with breast cancer and clinical assistance tools for Chinese breast cancer specialists by comparing the accuracy and reliability of responses to breast cancer&#x2013;related questions between ChatGPT and EB.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Questionnaire Design and Data Collection</title><p>The questions asked to LLMs consisted of a patient questionnaire and an expert questionnaire, each containing 15 questions and covering aspects such as the diagnosis, treatment, prognosis, and follow-up of breast cancer. The patient questionnaire was derived by distributing a questionnaire to patients with breast cancer to investigate their most important concerns (<xref ref-type="other" rid="box1">Textbox 1</xref>). The expert questionnaire was summarized by two experienced breast surgeons, based on the National Comprehensive Cancer Network (NCCN) Clinical Practice Guidelines in Oncology for breast cancer and the International Consensus Guidelines for advanced breast cancer (<xref ref-type="other" rid="box2">Textbox 2</xref>) [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. On January 15, 2024, all questions were input into ChatGPT (version 4.0) and EB (version 4.0). Each input was independently entered using the &#x201C;new chat&#x201D; function and inputted twice to detect its repeatability. 
To optimize the responses of the LLMs, prompt engineering was applied with the same lead-in statement: &#x201C;Now that you are a breast cancer specialist, please answer the following questions,&#x201D; which was input into the LLMs along with each question. As ChatGPT was developed in the United States, we queried ChatGPT in both Chinese and English, denoted as ChatGPT-Chinese (ChatGPT-C) and ChatGPT-English (ChatGPT-E), respectively, and the EB, developed in China, was queried in Chinese only. The responses from ChatGPT and EB were recorded using Microsoft Excel (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><boxed-text id="box1"><title> Specific contents of patient questionnaire.</title><p><bold>Patient Questionnaire</bold></p><list list-type="order"><list-item><p>Is breast cancer hereditary, and will it have an impact on my descendants?</p></list-item><list-item><p>What impact does the staging of breast cancer have on treatment and prognosis?</p></list-item><list-item><p>What are the treatment methods for breast cancer, and which one should I choose?</p></list-item><list-item><p>What are the various surgical treatment methods for breast cancer, and how do they each impact the appearance of the breast?</p></list-item><list-item><p>What is the total cost of treating breast cancer in China?</p></list-item><list-item><p>What aspects are included in the postoperative rehabilitation training for breast cancer, and what benefits does it bring to rehabilitation?</p></list-item><list-item><p>Is breastfeeding possible during breast cancer treatment?</p></list-item><list-item><p>Why do I need systemic treatment (such as chemotherapy, endocrine therapy, or targeted therapy) after breast cancer surgery?</p></list-item><list-item><p>What are the adverse reactions of drugs used in breast cancer treatment?</p></list-item><list-item><p>How do I manage psychological and emotional health issues during the treatment of breast 
cancer?</p></list-item><list-item><p>What lifestyles contribute to the recovery of breast cancer patients?</p></list-item><list-item><p>What daily care is required for a subcutaneously implanted infusion port?</p></list-item><list-item><p>Can breast cancer patients have normal fertility after discharge?</p></list-item><list-item><p>What is the risk of recurrence and the corresponding monitoring methods after breast cancer treatment?</p></list-item><list-item><p>If a breast cancer patient has other chronic illnesses or new health issues that need to be addressed, how should these issues be coordinated with the treatment of breast cancer?</p></list-item></list></boxed-text><boxed-text id="box2"><title> Specific contents of expert questionnaire.</title><p><bold>Expert Questionnaire</bold></p><list list-type="order"><list-item><p>What are the screening methods for breast cancer?</p></list-item><list-item><p>What imaging and biomarkers will you use to assist in the preoperative diagnosis of breast cancer?</p></list-item><list-item><p>What are the requirements for the surgical margins in breast-conserving surgery for ductal carcinoma in situ?</p></list-item><list-item><p>For cN1 breast cancer patients who have converted to cN0 after neoadjuvant therapy, what are the requirements for sentinel lymph node biopsy at this stage?</p></list-item><list-item><p>What surgical methods do you know for stage I breast reconstruction?</p></list-item><list-item><p>What is the strategy for adjuvant chemotherapy in early-stage triple-negative breast cancer?</p></list-item><list-item><p>For early-stage high-risk breast cancer patients with strongly positive hormone receptors, which adjuvant endocrine therapy would you recommend?</p></list-item><list-item><p>What are the different classes of drugs for anti-HER2 therapy?</p></list-item><list-item><p>What is the first-line treatment of choice for stage IV or recurrent metastatic HR-positive/HER2-negative breast 
cancer?</p></list-item><list-item><p>What are the conditions for exemption from radiotherapy after breast-conserving surgery?</p></list-item><list-item><p>What are the common regimens for neoadjuvant therapy in triple-negative breast cancer?</p></list-item><list-item><p>What are your basic principles for the treatment of metastatic breast cancer?</p></list-item><list-item><p>What are your recommendations for the management of bone health in patients during adjuvant endocrine therapy?</p></list-item><list-item><p>For young female breast cancer patients with HR-positive tumors who express a desire for fertility, what considerations do you have in the treatment plan?</p></list-item><list-item><p>How should long-term follow-up and monitoring be conducted for breast cancer patients?</p></list-item></list></boxed-text></sec><sec id="s2-2"><title>Response Assessment</title><p>The breast cancer multidisciplinary treatment team scored the accuracy, practicality, and generalization-specificity of each response using a Likert scale, with the poorer of the two responses being included in the final score if the responses were inconsistent. The team consisted of 13 members, including 7 breast cancer specialists, 2 imaging specialists, 2 pathology specialists, and 2 nursing specialists. The Likert scale is a hierarchical scale originally developed by Likert that has been used extensively in several research studies [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Accuracy was divided into 6 levels from 1 to 6, with higher scores indicating better accuracy (<xref ref-type="table" rid="table1">Table 1</xref>). Practicality was divided into 4 levels from 1 to 4, with higher scores indicating better practicality (<xref ref-type="table" rid="table2">Table 2</xref>). 
The generalization-specificity score (GSS) is divided into 5 levels from 1 to 5, with higher scores indicating better specificity (<xref ref-type="table" rid="table3">Table 3</xref>). To reduce bias caused by individual differences in understanding the scoring system, all experts reviewed and discussed the scoring criteria of the Likert scale before the assessment.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Accuracy scoring standard.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Scoring description</td><td align="left" valign="bottom">Scoring</td></tr></thead><tbody><tr><td align="left" valign="bottom">Completely incorrect</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="bottom">More incorrect than correct</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="bottom">Approximately equal correct and incorrect</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="bottom">More correct than incorrect</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="bottom">Nearly all correct</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="bottom">All correct</td><td align="left" valign="top">6</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Practical scoring standard.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Scoring description</td><td align="left" valign="bottom">Scoring</td></tr></thead><tbody><tr><td align="left" valign="top">Completely impractical</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Slightly practical</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">Moderately practical</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Very practical</td><td 
align="left" valign="top">4</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Generalization-Specificity Score (GSS) scoring standard.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Scoring description</td><td align="left" valign="bottom">Scoring</td></tr></thead><tbody><tr><td align="left" valign="top">Fully generalized, with no specific details or targeted information provided</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Primarily generalized but mentions some relevant details or information</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">Combines generalized content with some specific details or information</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Rather specific, but the details or targeted information are insufficient and can be improved</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">Fully specific, with comprehensive details and highly targeted information</td><td align="left" valign="top">5</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>Statistical Analysis</title><p>The Shapiro-Wilk test was used to determine the normality of the scores and the Levene test was used to evaluate the homogeneity of variance. Differences between two groups were assessed using the Wilcoxon rank sum test. The Kruskal-Wallis test evaluated differences between three or more groups of variables, and the Dunn test was used for two-way between-group comparisons of variables that were not normally distributed. <italic>P</italic>&#x003C;.05 was deemed statistically significant. The intraclass correlation coefficient (ICC) was used to evaluate the consistency of accuracy, practicality scores and GSS among 13 raters. An ICC &#x2265;0.75 was considered to indicate good consistency. 
All statistical analyses were performed using R software (version 4.0.3; R Foundation for Statistical Computing).</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study did not gather patient data and did not involve human subjects. Therefore, approval by the institutional review board of Chongqing Medical University was not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>In the patient questionnaire, the median accuracy scores of ChatGPT-E, ChatGPT-C, and EB were 5.00 (IQR 5.00-6.00), 5.00 (IQR 5.00-6.00), and 5.00 (IQR 4.00-5.00), respectively. The median practicality scores of ChatGPT-E, ChatGPT-C, and EB were 4.00 (IQR 3.00-4.00), 3 (IQR 3.00-3.00), and 3.00 (IQR 3.00-3.00), respectively; and the median GSS were 4.00 (IQR 3.00-4.00), 3 (IQR 3.00-4.00), and 3.00 (IQR 3.00-4.00), respectively. The accuracy and practicality of ChatGPT-E were significantly higher than those of ChatGPT-C and EB (<italic>P</italic>&#x003C;.001). The accuracy and practicality of ChatGPT-C were also significantly higher than those of EB (<italic>P</italic>=.002 and <italic>P</italic>&#x003C;.001, respectively; <xref ref-type="fig" rid="figure1">Figure 1A and B</xref>). The specificity of the ChatGPT-E responses was significantly higher than that of the ChatGPT-C and EB (<italic>P</italic>=.002 and <italic>P</italic>&#x003C;.001, respectively), whereas no significant difference was found in the specificity of the ChatGPT-C and EB responses (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, parts A and B).</p><p>In the expert questionnaire, the median accuracy scores of ChatGPT-E, ChatGPT-C, and EB were 5.00 (IQR 5.00-5.00), 4.00 (IQR 4.00-5.00), and 4.00 (IQR 4.00-5.00), respectively; the median practicality scores for all three were 3.00 (IQR 3.00-3.00) and the median GSS were 4.00 (IQR 3.00-4.00), 3 (IQR 2.00- 4.00), and 3.00 (IQR 2.00-3.00), respectively. 
The accuracy and practicality of ChatGPT-E were significantly higher than those of ChatGPT-C and EB (all <italic>P</italic>&#x003C;.001). However, there was no significant difference in the accuracy and practicality scores between ChatGPT-C and EB (<italic>P</italic>=1.000 and <italic>P</italic>=.72, respectively) (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For response generalization and specificity, the ChatGPT-E score was significantly higher than ChatGPT-C and EB (both <italic>P</italic>&#x003C;.001), whereas there was no significant difference between ChatGPT-C and EB. Overall, the median accuracy scores of the patient questionnaire and the expert questionnaire were 5.00 (IQR 5.00-5.00) and 5.00 (IQR 4.00-5.00), respectively; the median practicality scores were 3.00 (IQR 3.00-4.00) and 3.00 (IQR 3.00-3.00), respectively; and the median GSS was 3.00 (IQR 3.00-4.00) for both questionnaires. The accuracy and practicality scores from the patient questionnaire were significantly higher than those from the expert questionnaire (all <italic>P</italic>&#x003C;.001) (<xref ref-type="fig" rid="figure1">Figure 1</xref>), and their specificity scores were also significantly higher (<italic>P</italic>&#x003C;.001) (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, parts C-E).</p><p>In addition, to quantify the frequency of ratings for accuracy, practicality, and specificity in LLM responses, we illustrated the rating distribution as percentages in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. For accuracy, responses rated as 6 (All correct) accounted for only 11.96% (70/585) in the patient questionnaire, 5.64% (33/585) in the expert questionnaire, and 8.8% (103/1170) overall. 
Notably, the hallucination rate&#x2014;defined as responses rated &#x2264;4 on the accuracy scale&#x2014;was 19.7% (115/585) in the patient questionnaire and 28.9% (169/585) in the expert questionnaire. For practicality, responses rated as 4 (Very practical) accounted for 26.15% (153/585) in the patient questionnaire, 8.55% (50/585) in the expert questionnaire, and 17.35% (203/1170) overall. For generalization-specificity, responses rated as 5 (Fully specific) accounted for 5.64% (33/585) in the patient questionnaire, 2.39% (14/585) in the expert questionnaire, and 4.02% (47/1170) overall. The 13 raters exhibited excellent inter-rater agreement in their scoring of accuracy, practicality, and GSS, with ICC values of 0.878 (95% CI 0.837&#x2010;0.912), 0.823 (95% CI 0.765&#x2010;0.873), and 0.809 (95% CI 0.758&#x2010;0.855), respectively. Additionally, the statistical descriptive indices for all between-group comparisons are provided in detail in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendices 4</xref><xref ref-type="supplementary-material" rid="app5"/><xref ref-type="supplementary-material" rid="app6"/><xref ref-type="supplementary-material" rid="app7"/>-<xref ref-type="supplementary-material" rid="app8">8</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Scores and comparisons of the overall accuracy and practicality of the LLMs' responses. (A,B): Patient questionnaire; (C,D) Expert questionnaire; (E,F): Comparison of patient and expert questionnaire. Error bars represent mean &#x00B1; standard error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66429_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We have reported several important findings in this study. 
First, based on the patient questionnaire responses, ChatGPT-E demonstrated significantly higher accuracy compared to ChatGPT-C and EB in addressing questions related to breast cancer surgery treatment (Q4) and postoperative management (Q6, Q11, and Q15) (<xref ref-type="fig" rid="figure2">Figure 2A</xref>). Additionally, ChatGPT-E&#x2019;s responses to questions concerning breast cancer staging (Q2), treatment (Q4 and Q8), and postoperative management (Q6 and Q14) were more comprehensive and practical (<xref ref-type="fig" rid="figure2">Figure 2</xref>). In the expert questionnaire, ChatGPT-E demonstrated similar advantages, especially for breast cancer drug treatment (Q8) and follow-up (Q15), with more comprehensive, accurate, and practical responses, reflecting higher efficiency (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Overall, ChatGPT-E performed the best in both patient and expert questionnaires. Despite the advantages in training strategies that may have enabled ChatGPT-C to perform better than EB in answering general questions from patients with breast cancer, the performances of both models were unsatisfactory while answering comparatively specialized questions in the field of breast cancer in the Chinese-language context (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For example, in response to the expert questionnaire Q5, both ChatGPT-C and EB only briefly mentioned several common methods of first-stage breast reconstruction. Only ChatGPT-E mentioned &#x201C;latissimus dorsi flap breast reconstruction&#x201D; and briefly introduced the advantages of each surgical method. Although its response was not detailed enough, the basic framework was correct. Although the overall hallucination rate of ChatGPT-E was significantly lower than that of ChatGPT-C and EB (both <italic>P</italic>&#x003C;.001), 11.79% (23/195) of its responses in the expert questionnaire still contained inaccurate information. 
This finding indicates that even models with relatively superior performance must further reduce hallucination rates in the specialized field of breast cancer to meet clinical requirements.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Scores and comparisons of LLMs' responses to specific questions in the patient questionnaire. A: Accuracy; B: Practicality. Error bars represent mean &#x00B1; standard error. ChatGPT-E: ChatGPT-English ; ChatGPT-C: ChatGPT-Chinese; EB: ERNIE Bot.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66429_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Scores and comparisons of LLMs' responses to specific questions in the expert questionnaire. A: Accuracy; B: Practicality. Error bars represent mean &#x00B1; standard error. ChatGPT-E: ChatGPT-English ; ChatGPT-C: ChatGPT-Chinese; EB: ERNIE Bot.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66429_fig03.png"/></fig></sec><sec id="s4-2"><title>Cross-Language Limitations of ChatGPT</title><p>In addition, we found that ChatGPT-C&#x2019;s responses to the 15 questions in the patient and expert questionnaire each contained one notable medical terminology translation error. For instance, &#x201C;&#x6CDB;&#x7D20;&#x916E; (Tamoxifen)&#x201D; and &#x201C;&#x83B1;&#x7279;&#x7F57;&#x5511; (Letrozole)&#x201D; were mentioned in the response to expert questionnaire Q7. In patient questionnaire Q12, the term &#x201C;port&#x201D; in &#x201C;subcutaneously implanted infusion port&#x201D; was translated as &#x201C;harbor&#x201D; (ie, &#x6E2F;&#x53E3; in Chinese). Patients may misinterpret postoperative care requirements due to this nonstandard translation, potentially leading to complications. In the medical domain, English is the primary language for international academic communication. 
ChatGPT&#x2019;s core training data is predominantly in English, including extensive English medical literature and clinical guidelines (eg, National Comprehensive Cancer Network and Advanced Breast Cancer). Compared with EB, ChatGPT has greater access to and understanding of these professional resources, enabling it to provide more comprehensive and accurate information when answering related questions. The inferior performance of ChatGPT-C compared to ChatGPT-E may stem from limitations in cross-language processing. Although ChatGPT supports multiple languages, the semantic structure and medical terminology in Chinese differ significantly from English. During cross-language processing, ChatGPT may rely on translation mechanisms rather than native Chinese training, leading to semantic distortion in specialized content and reduced answer quality. Preliminary testing in other languages such as Spanish and French has identified similar issues when dealing with proper nouns (eg, medications, surgical procedures) and compound terms. The model tends to rely on literal translation or the creation of neologisms rather than following localized standards, which may lead to ambiguities. However, a recent study by Tian et al [<xref ref-type="bibr" rid="ref19">19</xref>] on ChiMed-GPT found that pretraining GPT using a specific Chinese medical dataset made ChiMed-GPT perform significantly better than other models in tasks such as multiple choice and open-ended responses. To address the translation errors in ChatGPT-C&#x2019;s responses to breast cancer&#x2013;related questions, fine-tuning the model with Chinese medical datasets represents an effective improvement strategy. These datasets should include a wide range of Chinese medical literature, clinical guidelines, case reports, and patient-doctor dialogues specific to the field of breast cancer. 
Training the model on these specialized datasets would help it grasp the nuances and context of medical language better, leading to more accurate translations and responses. In addition, user error correction interfaces can be designed to allow physicians or patients to flag translation errors (eg, &#x201C;&#x83B1;&#x7279;&#x7F57;&#x5511; &#x2192; &#x6765;&#x66F2;&#x5511;&#x201D;), and the system could then automatically collect these error cases and add them into fine-tuning datasets, thus achieving continuous model optimization. In terms of response repeatability, the performance of ChatGPT-E (52/60, 86.67%) was significantly better than ChatGPT-C (50/60, 83.33%) and EB (40/60, 66.67%).</p></sec><sec id="s4-3"><title>Challenges Encountered by LLMs</title><p>It is worth noting that we have also found a lack of corresponding empirical data and references to support the views of the two LLMs in their responses, which could undermine the credibility and practicality of their responses, especially in evidence-based clinical practice [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. For example, in responses to the expert questionnaire Q14 and patient questionnaire Q7, both LLMs mentioned that &#x201C;chemotherapy affects fertility or breastfeeding,&#x201D; but failed to provide any useful references, resulting in compromised credibility and a possible inability to guide physicians and patients in making correct decisions. In addition, the responses of LLMs were relatively generalized, indicating that they were widely mentioned but lacked specificity, similar to the findings of the study by Giannakopoulos et al [<xref ref-type="bibr" rid="ref22">22</xref>] who used LLMs to answer dental-related questions. These generalized responses also resulted in the LLMs being less accurate and practical in answering the expert questionnaire than the patient questionnaire (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
For example, in the responses to expert questionnaire Q15 and patient questionnaire Q14, although LLMs mentioned the need for regular follow-up and corresponding examinations for patients with breast cancer, they did not provide specific answers. These generalized responses are of limited value to clinical professionals, who require highly accurate, comprehensive, and professional information, similar to previous studies on ChatGPT&#x2019;s responses to mental health and liver cancer&#x2013;related questions [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. However, they may be beneficial for patients with breast cancer who lack medical expertise, as the responses from LLMs already covered the vast majority of the questions and were comparable to clinician responses, similar to the findings from a study by Endo et al [<xref ref-type="bibr" rid="ref25">25</xref>] on the use of LLMs for answering questions related to liver transplantation. Generalized LLM&#x2013;generated information poses risks ranging from clinical mismanagement to ethical violations, particularly in complex fields such as breast cancer. In breast cancer treatment, timely decision-making is critical because early diagnosis and intervention significantly improve cure and survival rates. Vague recommendations from LLMs may put patients at risk of missing the optimal therapeutic window, thus potentially exacerbating disease progression. While LLMs such as ChatGPT-E show promise in patient education, their utility depends on the patients&#x2019; ability to contextualize and validate the outputs provided. Patients should maintain a cautious attitude toward responses generated by LLMs that lack personalized recommendations and refrain from relying on them exclusively. It is recommended that patients use the information provided by LLMs as a general reference, while promptly communicating with professional physicians. 
By integrating their specific clinical circumstances, patients can obtain accurate and personalized medical advice and guidance to safeguard their health and safety.</p></sec><sec id="s4-4"><title>Ethics and Data Security in LLMs</title><p>Furthermore, LLMs have exposed potential ethical risks in responding to breast cancer&#x2013;related questions. While clinical trials may offer access to the latest therapeutic regimens and advanced technologies, they inherently carry uncertainties and potential adverse effects. When responding to Patient Questionnaire Q2 and Expert Questionnaire Q6, LLMs encouraged patient participation in clinical trials without adequately explaining the risks and uncertainties involved. This could lead patients to assume unnecessary risks without being fully informed [<xref ref-type="bibr" rid="ref26">26</xref>]. We recommend establishing a dedicated review team to systematically audit medical recommendations provided by LLMs, particularly regarding clinical trial recommendations and vague suggestions. This ensures that LLMs&#x2019; responses adhere to medical ethical standards and professional norms and that responses that do not meet the requirements are corrected or blocked. At the same time, the issue of data security involved in LLMs is becoming increasingly prominent [<xref ref-type="bibr" rid="ref27">27</xref>]. Although the responses of the LLMs in this study did not inadvertently leak sensitive information and were based on general medical knowledge and standardized recommendations, some studies have shown that LLMs may inadvertently memorize and disclose original data in their responses [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. In a study by Nasr et al [<xref ref-type="bibr" rid="ref30">30</xref>], researchers were able to extract training data for various LLMs including ChatGPT through specific &#x201C;attacks&#x201D;. 
Therefore, doctors or health care organizations should obtain informed consent from patients when using real patient data for model training and application to LLMs, while ensuring the anonymization and deidentification of data [<xref ref-type="bibr" rid="ref26">26</xref>]. Patients should also be trained in data security awareness and instructed to avoid disclosing private personal information while using LLMs to obtain health information [<xref ref-type="bibr" rid="ref31">31</xref>]. China has established a series of laws and regulations, including the Personal Information Protection Law (PIPL), to regulate data processing and applications. However, specific regulatory details for LLM&#x2013;based medical applications are still being explored and refined. The effective implementation of these laws and regulations in LLM&#x2013;based medical applications is currently a concern [<xref ref-type="bibr" rid="ref32">32</xref>]. Based on the results of this study, future regulation should focus on standardizing LLM&#x2013;based medical applications. It is essential to strictly regulate data processing procedures to ensure data security and privacy protection in compliance with laws and regulations such as the PIPL. Further, an ethical review mechanism for LLM&#x2013;based medical applications should be established, clarifying ethical guidelines in aspects such as medical information provision and patient decision-making guidance to prevent ethical risks [<xref ref-type="bibr" rid="ref33">33</xref>]. Additionally, explicit limitations should be imposed on the scope and modalities of LLMs in health care to prevent their excessive involvement in core medical operations when sufficient reliability is not assured.</p></sec><sec id="s4-5"><title>Conclusion</title><p>This study reflects some important issues that may arise when using LLMs in clinical scenarios related to breast cancer in China. 
Overall, LLMs can serve as effective tools for Chinese patients with breast cancer to obtain health information, helping to address the majority of concerns related to diagnosis, treatment, recovery, and follow-up of this population. However, in the context of breast cancer specialists, the accuracy, practicality, and relevance of LLMs&#x2019; responses need improvement. We propose a multidimensional optimization framework to enhance the utility and reliability of ChatGPT in breast cancer diagnosis and management. On the one hand, the model should be trained using high-quality medical data, such as the latest breast cancer research, clinical guidelines, and case reports, to improve its accuracy and practicality in the professional domain. On the other hand, under ethical compliance, ChatGPT should be connected to deidentified electronic health records, laboratory systems, and imaging databases to access real-time patient data and provide more personalized recommendations. Based on our research, ChatGPT-E demonstrates better repeatability, accuracy, and practicality in its responses compared to other LLMs. Therefore, it is recommended that Chinese patients with breast cancer translate their questions into English before querying ChatGPT, to improve its effectiveness. In addition, considering the potential data security, ethical, and legal risks of LLMs in clinical practice, it is essential to strengthen regulation of the training and application of LLMs in the medical professional field [<xref ref-type="bibr" rid="ref34">34</xref>]. This study has certain limitations as the responses from LLMs were not applied in real time to address the questions of patients with breast cancer or to assist doctors in making clinical decisions. We also did not evaluate all issues related to breast cancer. The data collection for this study was completed in January 2024. 
However, certain models (eg, GPT-4-turbo and DeepSeek) had not been publicly released at that time or failed to provide stable interfaces for academic research applications and therefore were not included in this study. Additionally, this study used structured questionnaires to evaluate the responses of LLMs, which ensured standardized assessment but partially limited the assessment of LLMs&#x2019; ability to handle open-ended, unstructured, and interactive questions. Future iterations could incorporate open-ended or interactive question types to better simulate real-world clinical consultations. Lastly, patient and expert user feedback can provide critical user-perspective data, address the limitations of existing expert-only evaluations, and enhance the application effectiveness and user experience of LLMs in health care. Further research is required to evaluate the real-world clinical effectiveness of LLMs and the real user experience of patients with breast cancer in China.</p></sec></sec></body><back><ack><p>Senior Medical Talents Program of Chongqing for Young and Middle-aged (NO: 2023171&#x2010;21, 202374&#x2010;04).</p></ack><notes><sec><title>Data Availability</title><p>All data generated or analyzed in this study are included in this published article and its supplementary information files.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: GY, H Liu, JP</p><p>Data curation: H Luo, H Liu, JP</p><p>Formal analysis: AD, H Luo, JP, LL, XH</p><p>Funding acquisition: H Luo</p><p>Investigation: H Liu, JP, LL, XH</p><p>Methodology: AD, JP</p><p>Project administration: AD, H Luo, GY</p><p>Resources: XH</p><p>Software: LL</p><p>Supervision: GY, H Luo, LL</p><p>Validation: AD, GY, H Luo</p><p>Visualization: H Liu, LL</p><p>Writing-original draft: H Liu, JP</p><p>Writing - review &#x0026; editing: AD, GY, H Luo, LL, XH</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term 
id="abb1">ChatGPT-C</term><def><p>ChatGPT-Chinese</p></def></def-item><def-item><term id="abb2">ChatGPT-E</term><def><p>ChatGPT-English</p></def></def-item><def-item><term id="abb3">EB</term><def><p>ERNIE Bot</p></def></def-item><def-item><term id="abb4">GSS</term><def><p>generalization-specificity score</p></def></def-item><def-item><term id="abb5">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">PIPL</term><def><p>Personal Information Protection Law</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ferlay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Global Cancer Statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2021</year><month>05</month><volume>71</volume><issue>3</issue><fpage>209</fpage><lpage>249</lpage><pub-id pub-id-type="doi">10.3322/caac.21660</pub-id><pub-id pub-id-type="medline">33538338</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>HD</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>YW</given-names> </name><name name-style="western"><surname>Li</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>WQ</given-names> </name></person-group><article-title>Changing profiles of cancer burden worldwide and in China: a secondary analysis of the global cancer statistics 2020</article-title><source>Chin Med J</source><year>2021</year><volume>134</volume><issue>7</issue><fpage>783</fpage><lpage>791</lpage><pub-id pub-id-type="doi">10.1097/CM9.0000000000001474</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Alteri</surname><given-names>R</given-names> </name><name name-style="western"><surname>Barnes</surname><given-names>C</given-names> </name><name name-style="western"><surname>Burke</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Breast cancer facts &#x0026; figures 2013-2014</article-title><year>2013</year><publisher-name>American Cancer Society</publisher-name></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosmawati</surname><given-names>NHN</given-names> </name></person-group><article-title>Knowledge, attitudes and practice of breast self-examination among women in a suburban area in Terengganu, Malaysia</article-title><source>Asian Pac J Cancer Prev</source><year>2010</year><volume>11</volume><issue>6</issue><fpage>1503</fpage><lpage>1508</lpage><pub-id pub-id-type="medline">21338188</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berry</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Cronin</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Plevritis</surname><given-names>SK</given-names> </name><etal/></person-group><article-title>Effect of 
screening and adjuvant therapy on mortality from breast cancer</article-title><source>N Engl J Med</source><year>2005</year><month>10</month><day>27</day><volume>353</volume><issue>17</issue><fpage>1784</fpage><lpage>1792</lpage><pub-id pub-id-type="doi">10.1056/NEJMoa050518</pub-id><pub-id pub-id-type="medline">16251534</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>One in two EU citizens look for health information online</article-title><source>Eurostat</source><year>2021</year><access-date>2025-05-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ec.europa.eu/eurostat/web/products-eurostat-news/-/edn-20210406-1">https://ec.europa.eu/eurostat/web/products-eurostat-news/-/edn-20210406-1</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>H</given-names> </name></person-group><article-title>Online health information seeking: a review and meta-analysis</article-title><source>Health Commun</source><year>2021</year><month>09</month><volume>36</volume><issue>10</issue><fpage>1163</fpage><lpage>1175</lpage><pub-id pub-id-type="doi">10.1080/10410236.2020.1748829</pub-id><pub-id pub-id-type="medline">32290679</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Agricola</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gesualdo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Pandolfi</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Does Googling 
for preconception care result in information consistent with international guidelines: a comparison of information found by Italian women of childbearing age and health professionals</article-title><source>BMC Med Inform Decis Mak</source><year>2013</year><month>01</month><day>25</day><volume>13</volume><fpage>14</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-13-14</pub-id><pub-id pub-id-type="medline">23347453</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>PP</given-names> </name></person-group><article-title>YouTube for information on rheumatoid arthritis--a wakeup call?</article-title><source>J Rheumatol</source><year>2012</year><month>05</month><volume>39</volume><issue>5</issue><fpage>899</fpage><lpage>903</lpage><pub-id pub-id-type="doi">10.3899/jrheum.111114</pub-id><pub-id pub-id-type="medline">22467934</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Parsons</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dorff</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Cancer misinformation and harmful information on Facebook and other social media: a brief report</article-title><source>JNCI</source><year>2022</year><month>07</month><day>11</day><volume>114</volume><issue>7</issue><fpage>1036</fpage><lpage>1039</lpage><pub-id pub-id-type="doi">10.1093/jnci/djab141</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SB</given-names> </name><name name-style="western"><surname>King</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Warner</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Aneja</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kann</surname><given-names>BH</given-names> </name><name name-style="western"><surname>Bylund</surname><given-names>CL</given-names> </name></person-group><article-title>Using ChatGPT to evaluate cancer myths and misconceptions: artificial intelligence and cancer information</article-title><source>JNCI Cancer Spectr</source><year>2023</year><month>03</month><day>1</day><volume>7</volume><issue>2</issue><pub-id pub-id-type="doi">10.1093/jncics/pkad015</pub-id><pub-id pub-id-type="medline">36929393</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hopkins</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Logan</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kichenadasse</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sorich</surname><given-names>MJ</given-names> </name></person-group><article-title>Artificial intelligence chatbots will revolutionize how cancer patients access information: ChatGPT represents a paradigm-shift</article-title><source>JNCI Cancer Spectr</source><year>2023</year><month>03</month><day>1</day><volume>7</volume><issue>2</issue><fpage>pkad010</fpage><pub-id pub-id-type="doi">10.1093/jncics/pkad010</pub-id><pub-id pub-id-type="medline">36808255</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Alessandri Bonetti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pandya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>VT</given-names> </name><name name-style="western"><surname>Egro</surname><given-names>FM</given-names> </name></person-group><article-title>Dr. ChatGPT will see you now: How do Google and ChatGPT compare in answering patient questions on breast reconstruction?</article-title><source>J Plast Reconstr Aesthet Surg</source><year>2023</year><month>10</month><volume>85</volume><fpage>488</fpage><lpage>497</lpage><pub-id pub-id-type="doi">10.1016/j.bjps.2023.07.039</pub-id><pub-id pub-id-type="medline">37598590</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>A</given-names> </name></person-group><article-title>Enhancing diabetes self-management and education: a critical analysis of ChatGPT&#x2019;s role</article-title><source>Ann Biomed Eng</source><year>2024</year><month>04</month><volume>52</volume><issue>4</issue><fpage>741</fpage><lpage>744</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03317-8</pub-id><pub-id 
pub-id-type="medline">37553556</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gradishar</surname><given-names>WJ</given-names> </name><name name-style="western"><surname>Moran</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Abraham</surname><given-names>J</given-names> </name><etal/></person-group><article-title>NCCN Guidelines&#x00AE; Insights: Breast Cancer, Version 4.2023</article-title><source>J Natl Compr Canc Netw</source><year>2023</year><month>06</month><volume>21</volume><issue>6</issue><fpage>594</fpage><lpage>608</lpage><pub-id pub-id-type="doi">10.6004/jnccn.2023.0031</pub-id><pub-id pub-id-type="medline">37308117</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cardoso</surname><given-names>F</given-names> </name><name name-style="western"><surname>Paluch-Shimon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Senkus</surname><given-names>E</given-names> </name><etal/></person-group><article-title>5th ESO-ESMO international consensus guidelines for advanced breast cancer (ABC 5)</article-title><source>Ann Oncol</source><year>2020</year><month>12</month><volume>31</volume><issue>12</issue><fpage>1623</fpage><lpage>1649</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2020.09.010</pub-id><pub-id pub-id-type="medline">32979513</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rus</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Chiric&#x0103;</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ra&#x0163;iu</surname><given-names>L</given-names> 
</name><name name-style="western"><surname>B&#x0103;ban</surname><given-names>A</given-names> </name></person-group><article-title>Learning organization and social responsibility in Romanian higher education institutions</article-title><source>Procedia Soc Behav Sci</source><year>2014</year><month>08</month><volume>142</volume><fpage>146</fpage><lpage>153</lpage><pub-id pub-id-type="doi">10.1016/j.sbspro.2014.07.628</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>CN</given-names> </name></person-group><article-title>Correlation between the Likert Scale and the Numeric Rating Scale for evaluating knee pain</article-title><source>J Korean Knee Soc</source><year>2011</year><volume>23</volume><issue>1</issue><fpage>14</fpage><pub-id pub-id-type="doi">10.5792/jkks.2011.23.1.14</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tian</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>ChiMed-GPT: a Chinese medical large language model with full training regime and better alignment to human preferences</article-title><source>arXiv</source><comment>Preprint posted online on 
2023</comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Djulbegovic</surname><given-names>B</given-names> </name><name name-style="western"><surname>Guyatt</surname><given-names>GH</given-names> </name></person-group><article-title>Progress in evidence-based medicine: a quarter century on</article-title><source>Lancet</source><year>2017</year><month>07</month><day>22</day><volume>390</volume><issue>10092</issue><fpage>415</fpage><lpage>423</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(16)31592-6</pub-id><pub-id pub-id-type="medline">28215660</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Musheyev</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Loeb</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kabarriti</surname><given-names>AE</given-names> </name></person-group><article-title>How well do artificial intelligence chatbots respond to the top search queries about urological malignancies?</article-title><source>Eur Urol</source><year>2024</year><month>01</month><volume>85</volume><issue>1</issue><fpage>13</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1016/j.eururo.2023.07.004</pub-id><pub-id pub-id-type="medline">37567827</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannakopoulos</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kavadella</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aaqel Salim</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Stamatopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kaklamanos</surname><given-names>EG</given-names> </name></person-group><article-title>Evaluation of the performance of generative AI large language models ChatGPT, Google Bard, and Microsoft Bing Chat in supporting evidence-based dentistry: comparative mixed methods study</article-title><source>J Med Internet Res</source><year>2023</year><month>12</month><day>28</day><volume>25</volume><fpage>e51580</fpage><pub-id pub-id-type="doi">10.2196/51580</pub-id><pub-id pub-id-type="medline">38009003</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Farhat</surname><given-names>F</given-names> </name></person-group><article-title>ChatGPT as a complementary mental health resource: a boon or a bane</article-title><source>Ann Biomed Eng</source><year>2024</year><month>05</month><volume>52</volume><issue>5</issue><fpage>1111</fpage><lpage>1114</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03326-7</pub-id><pub-id pub-id-type="medline">37477707</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Ghaziani</surname><given-names>TT</given-names> </name><etal/></person-group><article-title>Accuracy of information provided by ChatGPT regarding liver cancer surveillance and diagnosis</article-title><source>AJR Am J Roentgenol</source><year>2023</year><month>10</month><volume>221</volume><issue>4</issue><fpage>556</fpage><lpage>559</lpage><pub-id pub-id-type="doi">10.2214/AJR.23.29493</pub-id><pub-id 
pub-id-type="medline">37222278</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Endo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sasaki</surname><given-names>K</given-names> </name><name name-style="western"><surname>Moazzam</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Quality of ChatGPT responses to questions related to liver transplantation</article-title><source>J Gastrointest Surg</source><year>2023</year><month>08</month><volume>27</volume><issue>8</issue><fpage>1716</fpage><lpage>1719</lpage><pub-id pub-id-type="doi">10.1007/s11605-023-05714-9</pub-id><pub-id pub-id-type="medline">37254022</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adhikari</surname><given-names>K</given-names> </name><name name-style="western"><surname>Naik</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hameed</surname><given-names>BZ</given-names> </name><name name-style="western"><surname>Raghunath</surname><given-names>SK</given-names> </name><name 
name-style="western"><surname>Somani</surname><given-names>BK</given-names> </name></person-group><article-title>Exploring the ethical, legal, and social implications of ChatGPT in urology</article-title><source>Curr Urol Rep</source><year>2024</year><month>01</month><volume>25</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1007/s11934-023-01185-2</pub-id><pub-id pub-id-type="medline">37735339</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name></person-group><article-title>Ethical considerations of using ChatGPT in health care</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>11</day><volume>25</volume><fpage>e48009</fpage><pub-id pub-id-type="doi">10.2196/48009</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eppler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ganjavi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ramacciotti</surname><given-names>LS</given-names> </name><etal/></person-group><article-title>Awareness and use of ChatGPT and large language models: a prospective cross-sectional global survey in urology</article-title><source>Eur 
Urol</source><year>2024</year><month>02</month><volume>85</volume><issue>2</issue><fpage>146</fpage><lpage>153</lpage><pub-id pub-id-type="doi">10.1016/j.eururo.2023.10.014</pub-id><pub-id pub-id-type="medline">37926642</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nasr</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carlini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hayase</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Scalable extraction of training data from (production) language models</article-title><source>arXiv</source><comment>Preprint posted online on Nov 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.17035</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name></person-group><article-title>Security implications of AI chatbots in health care</article-title><source>J Med Internet Res</source><year>2023</year><month>11</month><day>28</day><volume>25</volume><fpage>e47551</fpage><pub-id pub-id-type="doi">10.2196/47551</pub-id><pub-id pub-id-type="medline">38015597</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>The imperative for regulatory oversight of large language models (or generative AI) in healthcare</article-title><source>NPJ Digit 
Med</source><year>2023</year><month>07</month><day>6</day><volume>6</volume><issue>1</issue><fpage>120</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00873-0</pub-id><pub-id pub-id-type="medline">37414860</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohamed</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dunnett</surname><given-names>S</given-names> </name><name name-style="western"><surname>Flores</surname><given-names>A</given-names> </name><name name-style="western"><surname>Loew</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pienaar</surname><given-names>S</given-names> </name><collab>MILE (Medical Information Leaders in Europe)</collab></person-group><article-title>A principles framework for digital provision of medical information for healthcare professionals</article-title><source>Pharmaceut Med</source><year>2023</year><month>03</month><volume>37</volume><issue>2</issue><fpage>103</fpage><lpage>109</lpage><pub-id pub-id-type="doi">10.1007/s40290-023-00464-0</pub-id><pub-id pub-id-type="medline">37000411</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coskun</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ocakoglu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yetemen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaygisiz</surname><given-names>O</given-names> </name></person-group><article-title>Can ChatGPT, an artificial intelligence language model, provide accurate and high-quality patient information on prostate 
cancer?</article-title><source>Urology</source><year>2023</year><month>10</month><volume>180</volume><fpage>35</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1016/j.urology.2023.05.040</pub-id><pub-id pub-id-type="medline">37406864</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Specific contents of the patient and expert questionnaires and the LLMs' responses to them.</p><media xlink:href="medinform_v13i1e66429_app1.docx" xlink:title="DOCX File, 129 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Comparison of generalization-specificity score (GSS).</p><media xlink:href="medinform_v13i1e66429_app2.png" xlink:title="PNG File, 1765 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Percentage distribution of ratings.</p><media xlink:href="medinform_v13i1e66429_app3.png" xlink:title="PNG File, 1622 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Comparison of overall accuracy, practicality, and generalization-specificity score (GSS) between patient and expert questionnaires using Mann-Whitney U test.</p><media xlink:href="medinform_v13i1e66429_app4.docx" xlink:title="DOCX File, 12 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Multiple hypothesis testing (Dunn test) results for overall accuracy, practicality, and generalization-specificity score (GSS) of patient questionnaires among different models.</p><media xlink:href="medinform_v13i1e66429_app5.docx" xlink:title="DOCX File, 12 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Multiple hypothesis testing (Dunn test) results for overall accuracy, practicality, and generalization-specificity score (GSS) of expert questionnaires among different models.</p><media 
xlink:href="medinform_v13i1e66429_app6.docx" xlink:title="DOCX File, 12 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Results of multiple hypothesis tests (Dunn test) on the accuracy, practicality, and generalization-specificity score (GSS) of specific questions of patient questionnaires across different models.</p><media xlink:href="medinform_v13i1e66429_app7.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Results of multiple hypothesis tests (Dunn test) on the accuracy, practicality, and generalization-specificity score (GSS) of specific questions of expert questionnaires across different models.</p><media xlink:href="medinform_v13i1e66429_app8.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material></app-group></back></article>