<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e63924</article-id><article-id pub-id-type="doi">10.2196/63924</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Artificial Intelligence Chatbots on Ultrasound Examinations: Cross-Sectional Comparative Analysis</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Yong</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lu</surname><given-names>Xiao</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Luo</surname><given-names>Yan</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Ying</given-names></name><degrees>BSc</degrees><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ling</surname><given-names>Wenwu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Medical Ultrasound, West China Hospital of Sichuan University</institution><addr-line>37 Guoxue Alley</addr-line><addr-line>Chengdu</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Thoracic Surgery, West China Hospital of Sichuan University</institution><addr-line>Chengdu</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Chenxu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kufel</surname><given-names>Jakub</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chatzimina</surname><given-names>Maria</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Wenwu Ling, PhD, Department of Medical Ultrasound, West China Hospital of Sichuan University, 37 Guoxue Alley, Chengdu, 610041, China, 86 18980605569; <email>496273016@qq.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>9</day><month>1</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e63924</elocation-id><history><date date-type="received"><day>03</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>23</day><month>10</month><year>2024</year></date><date 
date-type="accepted"><day>19</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Yong Zhang, Xiao Lu, Yan Luo, Ying Zhu, Wenwu Ling. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 9.1.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e63924"/><abstract><sec><title>Background</title><p>Artificial intelligence chatbots are being increasingly used for medical inquiries, particularly in the field of ultrasound medicine. However, their performance varies and is influenced by factors such as language, question type, and topic.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the performance of ChatGPT and ERNIE Bot in answering ultrasound-related medical examination questions, providing insights for users and developers.</p></sec><sec sec-type="methods"><title>Methods</title><p>We curated 554 questions from ultrasound medicine examinations, covering various question types and topics. The questions were posed in both English and Chinese. 
Objective questions were scored based on accuracy rates, whereas subjective questions were rated by 5 experienced doctors using a Likert scale. The data were analyzed in Excel.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 554 questions included in this study, single-choice questions comprised the largest share (354/554, 64%), followed by short answers (69/554, 12%) and noun explanations (63/554, 11%). The accuracy rates for objective questions ranged from 8.33% to 80%, with true or false questions scoring highest. Subjective questions received acceptability rates ranging from 47.62% to 75.36%. ERNIE Bot was superior to ChatGPT in many aspects (<italic>P</italic>&#x003C;.05). Both models showed a performance decline in English, but ERNIE Bot&#x2019;s decline was less significant. The models performed better in terms of basic knowledge, ultrasound methods, and diseases than in terms of ultrasound signs and diagnosis.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Chatbots can provide valuable ultrasound-related answers, but performance differs by model and is influenced by language, question type, and topic. In general, ERNIE Bot outperforms ChatGPT. Users and developers should understand model performance characteristics and select appropriate models for different questions and languages to optimize chatbot use.</p></sec></abstract><kwd-group><kwd>chatbots</kwd><kwd>ChatGPT</kwd><kwd>ERNIE Bot</kwd><kwd>performance</kwd><kwd>accuracy rates</kwd><kwd>ultrasound</kwd><kwd>language</kwd><kwd>examination</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>With the rapid development of artificial intelligence (AI) technology, deep learning models are being increasingly and widely used in various fields, especially in natural language processing and computer vision [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
In the field of natural language processing, several large pretrained models, such as OpenAI&#x2019;s ChatGPT and Baidu&#x2019;s ERNIE Bot [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], have demonstrated strong text generation and understanding capabilities. These models acquire rich semantic knowledge and language patterns through pretraining on large-scale corpora, which enables them to handle various complex natural language tasks [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. In recent years, researchers have explored new algorithms and frameworks to optimize the performance of models and improve their accuracy and efficiency in handling complex tasks. In these studies, model selection, training, evaluation, and performance in practical applications have become the focus of research [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. The proportion of medical health&#x2013;related knowledge obtained through the internet is large, and chatbots are also used to answer various medical questions [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Researchers have performed many evaluations and studies on chatbots to answer medical questions, including ophthalmology, pediatric, urology, dentistry, and other professional directions, involving myopia, cirrhosis, hypertension, obesity, and other diseases and medical examination questions [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>With the rapid development of ultrasound medicine, the demand for ultrasound examination is increasing, and the teaching and popularization of ultrasound is limited. An increasing number of junior ultrasound doctors, students, and patients have begun to use chatbots to obtain ultrasound-related consultation and answers. 
However, current research shows that chatbot performance is uneven; in some areas or tasks, chatbot performance can reach more than 90% of the accuracy rate or satisfaction, and chatbot performance can even exceed that of some doctors; however, in some tasks, the answer provided is not valid or even wrong [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. There are performance differences among different models, which are also affected by many factors, such as language, question type, and topic [<xref ref-type="bibr" rid="ref15">15</xref>]. An in-depth understanding of how models perform in various domains and under various conditions is necessary and valuable for both users and developers [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>ChatGPT is a large-scale language model based on the transformer architecture developed by OpenAI, an American AI research laboratory. It simulates the process of human language generation and understanding through deep learning technology and adopts an autoregressive language modeling method to predict the next word or phrase in the text sequence to generate coherent text [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. ChatGPT training data are derived from massive amounts of text data on the internet, including news reports, academic articles, and social media content. After the data are cleaned and labeled, the data are used to train the parameters of the model so that it can capture the complex patterns and semantic relationships of natural language [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. 
ChatGPT&#x2019;s powerful language generation and context understanding capabilities enable it to automatically generate relevant and coherent responses based on the input text content, enabling natural interaction with human users and completing multiple linguistic tasks, such as questions and answers, text summaries, and sentiment analysis [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>ERNIE Bot is an intelligent question-and-answer system based on deep learning technology created by China&#x2019;s Baidu. Its basic principle is to analyze and understand the questions raised by users by natural language processing technology, convert the questions of users into a form that computers can understand, and extract relevant information from massive amounts of data [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Natural language generation models are then used to transform the extracted information into a form that humans can understand, generate answers, and return them to the user. ERNIE Bot has a large amount of data, including trillions of web data, billions of search data and image data, billions of daily voice call data, and 550 billion facts in the knowledge graph [<xref ref-type="bibr" rid="ref25">25</xref>]. ERNIE Bot uses advanced natural language processing algorithms to accurately understand user questions and provide precise answers. 
Furthermore, it supports personalized customization according to different scenarios and needs and can continue to learn and accumulate knowledge to improve intelligence.</p><p>The purpose of this cross-sectional study is (1) to evaluate and compare the performance of ChatGPT and ERNIE Bot in answering questions in ultrasound examination papers; (2) to comprehensively analyze and compare the performance differences of the models in different question types, topics, and input language environments; and (3) to explore the reasons for these differences. This study is expected to provide insight for chatbot users and developers so that we can better understand the performance of various models in different fields. For more complex medical problems and fields, we can attempt to combine the advantages of multiple models to make an objective comprehensive judgment and consider the results. Chatbots can provide better services while constantly improving their own performance.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Question Curation</title><p>We used questions from ultrasound examination papers as a data set. The questions, from the West China Clinical College of Medicine of Sichuan University, covered basic knowledge of ultrasound medicine, the digestive system, the urinary system, superficial organs, blood vessels, and the heart. The question types included single-choice, multiple-choice, true or false questions; noun explanations; and short answers. With a total of 584 questions, we excluded picture-related questions, repetitive questions, questions with poor grammar, and questions with subjective answers from this cross-sectional study. 
Finally, a total of 554 questions were included, and the flowchart of the questions suitable for inclusion in the study is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the questions included in the study.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e63924_fig01.png"/></fig></sec><sec id="s2-2"><title>Response Generation and Grading</title><p>Each question was entered into GPT-3.5 Turbo (GPT-3.5) and ERNIE Bot-3.5 (Bot-3.5) in both English and Chinese, and the responses were recorded. For each input question, the background definition of the question, and the explanation of the question type are given, such as the noun explanation of the undergraduate superficial organ ultrasound examination. An example of the questioning methods for different types of questions is shown in <xref ref-type="other" rid="box1">Textbox 1</xref>. The subjective questions in this cross-sectional study included noun interpretation questions and short answer questions. All the subjective questions were scored by 5 experts in the field of ultrasound medicine who are fluent in both Chinese and English and have more than 20 years of experience in the fields of ultrasound diagnosis and ultrasound teaching. The experts rated the comprehensive quality of the responses in both languages on a Likert scale [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>], which is an effective, objective, and fair evaluation method for quantifying and grading answers. The evaluation of subjective responses in this study includes completeness, logical clarity, accuracy, depth and breadth, creativity, and so on. 
A higher score indicates higher quality (1 point: very poor, the answer content seriously deviates from the requirements of the question, the logic is confused, and there is basically no correct content; 2 points: poor, part of the answer deviates from the requirements of the question, contains some correct content, but the logic is not clear enough, and there are some mistakes or omissions; 3 points: acceptable, the answers basically meet the requirements of the question, the content is relatively complete, the logic is relatively clear, but some details or explanations are not accurate or in depth; 4 points: good, the answer fully meets the requirements of the question, and the content is accurate, complete, and logical; and 5 points: very good, the answers fully meet the requirements of the question, the content is accurate and in depth, comprehensive, logical and rigorous, and even new insights or solutions are proposed). The experiments were completed in October 2024.</p><boxed-text id="box1"><title> Examples of how to ask questions, including the content of the question, background, and question type description.</title><p><bold>Example A</bold></p><list list-type="simple"><list-item><p>The following is a single-choice question for the ultrasound examination. 
Please select the most appropriate one from the options given.</p></list-item></list><list list-type="bullet"><list-item><p>Which of the following is true about ultrasound?</p></list-item></list><list list-type="bullet"><list-item><p><named-content content-type="indent">&#x2003;</named-content>Ultrasound is an electromagnetic wave with a strong penetrating force.</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>The commonly used frequency range of medical ultrasound is 2.5-12 MHz.</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Ultrasound wave is a wave with a frequency greater than 2000 Hz.</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>The form of ultrasound wave propagation in the human body is mainly shear wave.</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Ultrasound is not easily affected by gas and bone.</p></list-item></list><p><bold>Example B</bold></p><list list-type="simple"><list-item><p>The following is a multiple-choice question for the superficial organ ultrasound examination. 
Please select two or more of the correct answers from the options given.</p></list-item></list><list list-type="bullet"><list-item><p>To which layers is the mammary gland subdivided by the fascia layer?</p></list-item></list><list list-type="bullet"><list-item><p><named-content content-type="indent">&#x2003;</named-content>Skin layer</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Subcutaneous fat layer</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Glandular (parenchymal) layer</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Fat layer in the retromammary space</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>Chest wall layer</p></list-item></list><p><bold>Example C</bold></p><list list-type="simple"><list-item><p>The following is a judgment question for the ultrasound imaging examination. Please judge whether the following description is correct or not.</p></list-item></list><list list-type="bullet"><list-item><p>An infective endocarditis patient can definitely detect growth on ultrasound.</p></list-item></list><p><bold>Example D</bold></p><list list-type="simple"><list-item><p>The following is a noun explanation question for the superficial organ ultrasound examination. 
Please make an appropriate explanation of the following nouns.</p></list-item></list><list list-type="bullet"><list-item><p>Acoustic impedance</p></list-item></list><p><bold>Example E</bold></p><list list-type="simple"><list-item><p>The following is a short answer question for the ultrasound examination.</p></list-item></list><list list-type="bullet"><list-item><p>Please briefly describe the common etiology and typical ultrasound features of cirrhosis.</p></list-item></list></boxed-text></sec><sec id="s2-3"><title>Statistical Analysis</title><p>For statistical analysis purposes, all the questions were grouped into categories: basic knowledge, disease and etiology, ultrasound examination, ultrasound diagnosis, ultrasound signs, case analysis, etc. We used Microsoft Excel to conduct statistical analyses (version 2019; Microsoft Corporation).</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>Ethics approval was not required since the research did not involve human subjects or animals.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>All the questions included in this cross-sectional study are from real ultrasound examination papers, and the proportions of question types and categories are highly representative. The types and categories of all the questions included in this study are shown in <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>. Among the 554 eligible questions included in this study, according to the classification of question types, single-choice questions accounted for the highest proportion (354/554, 64%), followed by short answers (69/554, 12%) and noun explanations (63/554, 11%), and the rest were multiple-choice and true or false questions. 
According to the classification of topics, the greatest proportion of topics were basic knowledge (181/554, 33%), followed by disease and etiology (106/554, 19%) and ultrasound signs (96/554, 17%), and the rest were ultrasound examination, ultrasound diagnosis, and ultrasound case analysis.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Question types and categories included in this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Categories</td><td align="left" valign="bottom">Basics, n</td><td align="left" valign="bottom">Examinations, n</td><td align="left" valign="bottom">Diagnosis, n</td><td align="left" valign="bottom">Cases, n</td><td align="left" valign="bottom">Disease, n</td><td align="left" valign="bottom">Signs, n</td><td align="left" valign="bottom">Total, n</td></tr></thead><tbody><tr><td align="left" valign="top">Single choice</td><td align="left" valign="top">124</td><td align="left" valign="top">49</td><td align="left" valign="top">41</td><td align="left" valign="top">26</td><td align="left" valign="top">63</td><td align="left" valign="top">51</td><td align="left" valign="top">354</td></tr><tr><td align="left" valign="top">Multiple choice</td><td align="left" valign="top">13</td><td align="left" valign="top">8</td><td align="left" valign="top">6</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">12</td><td align="left" valign="top">9</td><td align="left" valign="top">48</td></tr><tr><td align="left" valign="top">True or false</td><td align="left" valign="top">10</td><td align="left" valign="top">7</td><td align="left" valign="top">3</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Noun explanation</td><td 
align="left" valign="top">21</td><td align="left" valign="top">8</td><td align="left" valign="top">3</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">24</td><td align="left" valign="top">7</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top">Short answer</td><td align="left" valign="top">13</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">7</td><td align="left" valign="top">29</td><td align="left" valign="top">69</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">181</td><td align="left" valign="top">82</td><td align="left" valign="top">63</td><td align="left" valign="top">26</td><td align="left" valign="top">106</td><td align="left" valign="top">96</td><td align="left" valign="top">554</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Study results for each question category, stratified by question style (single-choice, multiple-choice, and true or false questions), language (English and Chinese), and artificial intelligence model (GPT-3.5 Turbo [GPT-3.5] and ERNIE Bot-3.5 [Bot-3.5]).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Categories</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Basics, %</td><td align="left" valign="bottom">Examination, %</td><td align="left" valign="bottom">Diagnosis, %</td><td align="left" valign="bottom">Cases, %</td><td align="left" valign="bottom">Disease, %</td><td align="left" valign="bottom">Signs, %</td><td align="left" valign="bottom">Total, %</td></tr></thead><tbody><tr><td align="left" valign="top">Single choice (English)</td><td align="left" valign="top">GPT-3.5</td><td align="left" 
valign="top">58.87</td><td align="left" valign="top">59.18</td><td align="left" valign="top">58.54</td><td align="left" valign="top">57.69</td><td align="left" valign="top">50.79</td><td align="left" valign="top">58.82</td><td align="left" valign="top">57.34</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">57.26</td><td align="left" valign="top">61.22</td><td align="left" valign="top">58.54</td><td align="left" valign="top">61.54</td><td align="left" valign="top">63.49</td><td align="left" valign="top">60.78</td><td align="left" valign="top">59.89</td></tr><tr><td align="left" valign="top">Single choice (Chinese)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">60.48</td><td align="left" valign="top">61.22</td><td align="left" valign="top">63.41</td><td align="left" valign="top">57.69</td><td align="left" valign="top">53.97</td><td align="left" valign="top">60.78</td><td align="left" valign="top">59.6</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">58.06</td><td align="left" valign="top">65.31</td><td align="left" valign="top">56.1</td><td align="left" valign="top">65.38</td><td align="left" valign="top">68.25</td><td align="left" valign="top">70.59</td><td align="left" valign="top">62.99</td></tr><tr><td align="left" valign="top">Multiple choice (English)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">7.69</td><td align="left" valign="top">0</td><td align="left" valign="top">16.67</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">8.33</td><td align="left" valign="top">11.11</td><td align="left" valign="top">8.33</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">46.15</td><td 
align="left" valign="top">12.5</td><td align="left" valign="top">16.67</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">25.0</td><td align="left" valign="top">66.67</td><td align="left" valign="top">35.42</td></tr><tr><td align="left" valign="top">Multiple choice (Chinese)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">7.69</td><td align="left" valign="top">0</td><td align="left" valign="top">16.67</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">16.67</td><td align="left" valign="top">11.11</td><td align="left" valign="top">10.42</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">53.85</td><td align="left" valign="top">12.5</td><td align="left" valign="top">16.67</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">33.33</td><td align="left" valign="top">66.67</td><td align="left" valign="top">39.58</td></tr><tr><td align="left" valign="top">True or false (English)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">50</td><td align="left" valign="top">57.14</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">60</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">60</td><td align="left" valign="top">85.71</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">75</td></tr><tr><td align="left" valign="top">True or false (Chinese)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">50</td><td align="left" valign="top">71.43</td><td align="left" 
valign="top">100</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">65</td></tr><tr><td align="left" valign="top">Correct (%)</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">70</td><td align="left" valign="top">85.71</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">80</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>We collected the accuracy rate results of the 2 AI models, GPT-3.5 and Bot-3.5, for single-choice, multiple-choice, and true or false questions in English and Chinese. When the models were asked questions in Chinese (the original language of the test paper), the overall accuracy rate was as follows (GPT-3.5 vs Bot-3.5): single-choice (59.6% vs 62.99%), multiple-choice (10.42% vs 39.58%), and true or false questions (65% vs 80%). When the test paper was translated into English for questioning, the overall accuracy rates were as follows (GPT-3.5 vs Bot-3.5): single-choice (57.34% vs 59.89%), multiple-choice (8.33% vs 35.42%), and true or false questions (60% vs 75%). All translations are performed manually by experts who are proficient in English. It can be clearly seen that Bot-3.5 is superior to GPT-3.5 in all question types and languages. Furthermore, we calculated classified statistics according to the accuracy rates of different categories of questions, and the statistical results are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p><p>We collected the scores of the GPT-3.5 and Bot-3.5 AI models for noun interpretation and short answers in both Chinese and English. 
A total of 63 noun explanation questions and 69 short answer questions were included in this cross-sectional study. The scoring criteria were divided into 5 levels&#x2014;1 point=very poor, 2 points=poor, 3 points=acceptable, 4 points=good, and 5 points=very good. We take the average score of 5 experts, round the average score to the nearest whole number, and finally calculate classification statistics according to the score values. When asked questions in Chinese (the original language of the test paper), the most common scores were as follows (GPT-3.5 vs Bot-3.5): noun explanation (3 points vs 5 points) and short answer (3 points vs 4 points). When the test paper was translated into English for questioning, the most common scores were as follows (GPT-3.5 vs Bot-3.5): noun explanation (2 points vs 2 points) and short answer (3 points vs 4 points). The detailed frequency tables are shown in <xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of the distribution of scores stratified by question type (noun explanation, short answer), language (English and Chinese), and artificial intelligence model (GPT-3.5 Turbo [GPT-3.5] and ERNIE Bot-3.5 [Bot-3.5]).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Categories and model</td><td align="left" valign="bottom">1 point</td><td align="left" valign="bottom">2 points</td><td align="left" valign="bottom">3 points</td><td align="left" valign="bottom">4 points</td><td align="left" valign="bottom">5 points</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8"><bold>Noun explanation (English)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">12</td><td align="left" valign="top">15</td><td align="left" 
valign="top">15</td><td align="left" valign="top">8</td><td align="left" valign="top">13</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">14</td><td align="left" valign="top">19</td><td align="left" valign="top">9</td><td align="left" valign="top">6</td><td align="left" valign="top">15</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top" colspan="8"><bold>Noun explanation (Chinese)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">10</td><td align="left" valign="top">14</td><td align="left" valign="top">16</td><td align="left" valign="top">9</td><td align="left" valign="top">14</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">10</td><td align="left" valign="top">17</td><td align="left" valign="top">10</td><td align="left" valign="top">8</td><td align="left" valign="top">18</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top" colspan="8"><bold>Short answer (English)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">9</td><td align="left" valign="top">15</td><td align="left" valign="top">25</td><td align="left" valign="top">14</td><td align="left" valign="top">6</td><td align="left" valign="top">69</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">7</td><td align="left" valign="top">11</td><td align="left" valign="top">22</td><td align="left" valign="top">18</td><td align="left" valign="top">11</td><td align="left" valign="top">69</td></tr><tr><td align="left" valign="top" colspan="8"><bold>Short answer (Chinese)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" 
valign="top">GPT-3.5</td><td align="left" valign="top">8</td><td align="left" valign="top">15</td><td align="left" valign="top">23</td><td align="left" valign="top">16</td><td align="left" valign="top">7</td><td align="left" valign="top">69</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">5</td><td align="left" valign="top">12</td><td align="left" valign="top">19</td><td align="left" valign="top">20</td><td align="left" valign="top">13</td><td align="left" valign="top">69</td></tr></tbody></table></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The distribution of scores. Bot-3.5: ERNIE Bot-3.5; CHN: Chinese; Eng: English; GPT-3.5: GPT-3.5 Turbo; NE: noun explanation; SA: short answer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e63924_fig02.png"/></fig><p>We conducted a quantitative statistical analysis of noun explanations and short answer scores, including minimum, maximum, IQR, median, mean, sum, and SD scores. We compared the quantitative statistics of the GPT-3.5 and Bot-3.5 scores in the Chinese language environment. For noun interpretation, the maximum, minimum, and median values were equal for GPT-3.5 versus Bot-3.5; other statistics were as follows: IQR 2&#x2010;4 vs 2&#x2010;5, mean 2.86 vs 3.37, sum 180 vs 212, and SD 1.4 vs 1.37. There was a significant difference between the 2 models in terms of the quality of answers to noun interpretation (<italic>P</italic>&#x003C;.05). For short answers, the maximum, minimum, and median values were equal (IQR 2&#x2010;4 vs 2.5&#x2010;4, mean 2.93 vs 3.36, sum 202 vs 232, and SD 1.15 vs 1.19), and there was a significant difference between the 2 models in terms of the quality of answers to short answers (<italic>P</italic>&#x003C;.05). 
The statistical results are shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Quantitative statistical results of noun interpretation and short answer scores for 2 artificial intelligence models (GPT-3.5 Turbo [GPT-3.5] and ERNIE Bot-3.5 [Bot-3.5]).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Categories</td><td align="left" valign="bottom" colspan="2">Noun explanations (n=63)</td><td align="left" valign="bottom" colspan="2">Short answers (n=69)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">Bot-3.5</td><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">Bot-3.5</td></tr></thead><tbody><tr><td align="left" valign="top">Minimum</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Maximum</td><td align="left" valign="top">5</td><td align="left" valign="top">5</td><td align="left" valign="top">5</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top">IQR</td><td align="left" valign="top">2&#x2010;4</td><td align="left" valign="top">2&#x2010;5</td><td align="left" valign="top">2&#x2010;4</td><td align="left" valign="top">2.5&#x2010;4</td></tr><tr><td align="left" valign="top">Median</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Mean</td><td align="left" valign="top">2.86</td><td align="left" valign="top">3.37</td><td align="left" valign="top">2.93</td><td align="left" valign="top">3.36</td></tr><tr><td align="left" valign="top">Sum</td><td align="left" valign="top">180</td><td align="left" valign="top">212</td><td align="left" 
valign="top">202</td><td align="left" valign="top">232</td></tr><tr><td align="left" valign="top">SD</td><td align="left" valign="top">1.40</td><td align="left" valign="top">1.37</td><td align="left" valign="top">1.15</td><td align="left" valign="top">1.19</td></tr></tbody></table></table-wrap><p>To compare and analyze the performance of the 2 AI models (GPT-3.5 and Bot-3.5) in the Chinese language environment when noun explanations and short answers are classified according to different topics, we calculated the number of scores for each type of topic in the Chinese language environment. For noun interpretation, the highest percentage scores were obtained for GPT-3.5 vs Bot-3.5, basic knowledge (4 points vs 4 points), ultrasound examinations (5 points vs 5 points), disease and etiology (1 point vs 2 points), and ultrasound signs (1 point vs 1 point). For short answers, those with the highest percentage of points (GPT-3.5 vs Bot-3.5), basic knowledge (3 points vs 5 points), ultrasound examinations (2 points vs 3 points), ultrasound diagnosis (3 points vs 5 points), disease and etiology (3 points vs 2 points), and ultrasound signs (3 points vs 3 points) were included. The statistical results are shown in <xref ref-type="table" rid="table5">Table 5</xref>.</p><p><xref ref-type="table" rid="table5">Table 5</xref> also shows the differences in scores between the 2 models on different subject categories. Based on the above results, the most important finding is that Bot-3.5 performs better than GPT-3.5 in the Chinese environment, both objectively and subjectively. Bot-3.5 is representative of the localization model, and GPT-3.5 is representative of the internationalization model. The excellent performance of Bot-3.5 is due to its comprehensiveness and depth in the Chinese training data set. 
The discovery of this performance difference has implications for both users and developers of chatbot systems, and it is necessary for users to have a deeper understanding of the model&#x2019;s data set background, language, and performance to better use them. Developers can also improve and optimize robot models according to performance differences.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>The score distributions of noun interpretations and short answers in different topic categories for 2 artificial intelligence models (GPT-3.5 Turbo [GPT-3.5] and ERNIE Bot-3.5 [Bot-3.5]).</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Categories</td><td align="left" valign="bottom" colspan="3">Noun explanations</td><td align="left" valign="bottom" colspan="3">Short answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Total</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Bot-3.5</td><td align="left" valign="top">Total</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Bot-3.5</td></tr></thead><tbody><tr><td align="left" valign="top">&#x2003;</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><bold>Basic knowledge</bold></td><td align="left" valign="top">21</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">13</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;1 point</td><td align="left" valign="top"/><td align="left" valign="top">5</td><td align="left" valign="top">3</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">&#x2003;2 
points</td><td align="left" valign="top"/><td align="left" valign="top">4</td><td align="left" valign="top">0</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">&#x2003;3 points</td><td align="left" valign="top"/><td align="left" valign="top">4</td><td align="left" valign="top">4</td><td align="left" valign="top"/><td align="left" valign="top">4</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;4 points</td><td align="left" valign="top"/><td align="left" valign="top">5</td><td align="left" valign="top">8</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">&#x2003;5 points</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">6</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top"><bold>Examinations</bold></td><td align="left" valign="top">8</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">10</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;1 point</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">&#x2003;2 points</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">4</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">&#x2003;3 points</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" 
valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">&#x2003;4 points</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">&#x2003;5 points</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">4</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><bold>Diagnosis</bold></td><td align="left" valign="top">3</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">10</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;1 point</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;2 points</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">&#x2003;3 points</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">5</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;4 points</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;5 
points</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top"><bold>Disease and etiology</bold></td><td align="left" valign="top">24</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">7</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;1 point</td><td align="left" valign="top"/><td align="left" valign="top">6</td><td align="left" valign="top">0</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">&#x2003;2 points</td><td align="left" valign="top"/><td align="left" valign="top">5</td><td align="left" valign="top">9</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">&#x2003;3 points</td><td align="left" valign="top"/><td align="left" valign="top">6</td><td align="left" valign="top">6</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">&#x2003;4 points</td><td align="left" valign="top"/><td align="left" valign="top">6</td><td align="left" valign="top">3</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;5 points</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">6</td><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><bold>Ultrasound signs</bold></td><td align="left" valign="top">7</td><td align="left" valign="top"/><td align="left" valign="top"/><td 
align="left" valign="top">29</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;1 point</td><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">2</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;2 points</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">2</td><td align="left" valign="top"/><td align="left" valign="top">8</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">&#x2003;3 points</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">10</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">&#x2003;4 points</td><td align="left" valign="top"/><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">7</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">&#x2003;5 points</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">3</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Preliminary Findings</title><p>Since the public release of ChatGPT, its convenience has made AI more accessible than ever before. It has demonstrated its ability to provide answers even to knowledgeable and experienced professors, and its numerous advantages have quickly made it a hot topic and the focus of research. 
Many technology companies around the world are also developing chatbot systems [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Industries are also exploring how to integrate these AI technologies with their own work and learning to improve quality and efficiency [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. In the medical field, chatbot systems can be used as learning and consulting assistants to answer a variety of medical-related questions, but the accuracy and quality of their information must be carefully evaluated [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>In this cross-sectional study, we analyzed 554 actual examination paper questions from the field of ultrasound medicine and evaluated the performance of an AI chatbot system in answering these medical paper questions. Medical examination papers are often used to evaluate chatbot performance, mainly because of their wide coverage, representativeness, and reference answers [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Alessandri et al [<xref ref-type="bibr" rid="ref35">35</xref>] used the Residency Admission National Examination to evaluate ChatGPT. Humar et al [<xref ref-type="bibr" rid="ref22">22</xref>] evaluated ChatGPT with questions from the plastic surgery in-service examination. The main advantage of our study is that the questions are from actual ultrasound medical examinations, and its content is broad and representative. The question types included single-choice, multiple-choice, true or false questions; noun explanations; short answers; basic knowledge; ultrasound examination; ultrasound diagnosis; ultrasound case analysis; disease and etiology; and ultrasound signs. 
We conducted a quantitative comparative analysis of the performance of two of the most representative free chatbots (GPT-3.5 and Bot-3.5). Samaan et al [<xref ref-type="bibr" rid="ref16">16</xref>] demonstrated that chatbot performance is affected by the language environment. We use both English and Chinese to ask questions and analyze the quality of the answers provided by the 2 models in different linguistic environments.</p><p>Regardless of whether the input language is English or Chinese, the GPT-3.5 and Bot-3.5 models perform somewhat well on different types of questions in the field of ultrasound medicine. For objective questions (including single-choice, multiple-choice, and true or false questions in this cross-sectional study), we took the accuracy rate as the evaluation index, and a score of &#x2265;60% is acceptable, which is also the passing score of medical students. The best performance was true or false questions (accuracy rate: 60%-80%), followed by single-choice questions (accuracy rate: 57.34%-62.99%), and the worst performance was multiple-choice questions (accuracy rate: 8.33%-39.58%). For subjective questions (including noun explanations and short answers in this cross-sectional study &#x2265;3 points, considered to be acceptable answers, also the assessment criteria), the performance was better for short answers, with acceptable answers (&#x2265;3 points) accounting for 65.22%-75.36%, whereas the performance for noun explanations was lower, with acceptable answers accounting for 47.62%-61.9%.</p><p>The reasons behind these differences are worth exploring and analyzing. For objective questions, the difficulty and complexity of the questions may be important factors. Branum and Schiavenato [<xref ref-type="bibr" rid="ref14">14</xref>] reported that chatbots sometimes provide plausible but incorrect answers to complex clinical questions [<xref ref-type="bibr" rid="ref38">38</xref>]. 
True or false questions usually involve only the truth or falsity of a statement and are the least difficult. Although multiple options are provided in a single choice, the answer is unique, and the model needs to identify only the correct answer from a limited number of options, with moderate difficulty. The research results of Lai et al [<xref ref-type="bibr" rid="ref32">32</xref>] indicate that chatbots have high accuracy in single-choice selection. However, multiple choice requires the model to identify the correctness of multiple options at the same time, which significantly increases the difficulty and complexity of information processing. Mihalache et al [<xref ref-type="bibr" rid="ref3">3</xref>] demonstrated that substantial progress has not been made in multiple-choice chatbots. Furthermore, the processing of multiple-choice questions in the process of model training may be less common than that of other types of questions, and most of the multiple-choice answers are more flexible. For subjective questions, differences in openness, flexibility, and scoring criteria may be the reasons for differences in performance. Short answer questions often require the model to provide a short paragraph of explanation or description, which allows the model to show more flexibility and depth in its responses. On the other hand, noun interpretation is more focused on the precise interpretation of specific terms, which requires higher accuracy and professionalism of the model. When short answers are graded, the grading criteria may be more flexible, allowing a degree of freedom. Other studies have asked questions about diseases similar to the short-answer questions in this study. Yun et al [<xref ref-type="bibr" rid="ref33">33</xref>] demonstrated the effectiveness of chatbots when short-answer questions are answered. 
Noun interpretation, on the other hand, can be more rigorous, requiring the model to provide a precise and professional interpretation [<xref ref-type="bibr" rid="ref39">39</xref>]. Therefore, the differences in the performance of the models on different question types can be attributed to the differences in the difficulty of the questions, the complexity of information processing, the training data, and the scoring criteria. As a user, it is necessary to understand the performance differences of models of different types in advance, and as a developer, adjusting the distribution of the training data set and optimizing the model are important.</p><p>When comparing the performance of the GPT-3.5 and Bot-3.5 AI models in the field of ultrasound medicine, we found some significant differences. <xref ref-type="table" rid="table2">Table 2</xref> reported that the accuracy rate of GPT-3.5 for multiple-choice and true or false questions is lower than that of Bot-3.5, suggesting that Bot-3.5 may be more capable of handling problems that require deep understanding and reasoning. Tang and Yang [<xref ref-type="bibr" rid="ref23">23</xref>] demonstrated that Bot-3.5 shows greater participation and enthusiasm in teaching applications. This difference may be related to the differences in training data and algorithm architecture between the 2 models. Bot-3.5 may have been exposed to more diverse and complex data from the medical field during training to be better able to handle these types of problems. From <xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref>, we further analyze the scores of the 2 models on different question types. In terms of noun interpretation and short answers, Bot-3.5 scores are generally higher than GPT-3.5 scores, especially in the Chinese context. 
This result suggested that Bot-3.5 may have greater accuracy in understanding and interpreting medical terms and concepts. In addition, from the comparison of the score statistics, we can observe that Bot-3.5 is more concentrated in the score distribution, which implies that it has greater stability and consistency in dealing with similar problems.</p><p>When the test paper was translated into English for questioning, the performance of both models declined, but Bot-3.5 maintained its advantage in terms of multiple-choice and true or false questions. This finding reveals the limitations of both models in cross-language processing. Zhu et al [<xref ref-type="bibr" rid="ref25">25</xref>] compared the performance of large language models developed in different countries, highlighting the necessity for localized models. Although both models claim to support multilingual processing, in practice, the model&#x2019;s ability to process across languages is affected by various aspects, such as architecture, algorithms, or training data, and when the model switches from 1 language to another, it may need to readjust to new language features. This adaptation process can lead to performance degradation, especially when dealing with complex tasks. During the training of the model, there may be differences in the distribution and scale of the Chinese and English data, and if the model is trained more fully on the Chinese data, it may perform better on the Chinese input. Bot-3.5 was developed and trained in China and has a large Chinese corpus. During the design and training process, Chinese language features were deeply optimized. This optimization allows Bot-3.5 to perform well in terms of Chinese semantic understanding and context grasp.</p><p>Biswas et al [<xref ref-type="bibr" rid="ref26">26</xref>] conducted a categorical assessment analysis of these problems. 
In terms of noun interpretation and short answer questions, both models scored highly in basic knowledge, ultrasonography, disease, and cause, which may be related to the importance of these topics in the medical field and their richness in the training data. However, in areas such as ultrasound signs and ultrasound diagnosis, both models scored relatively low, which may reflect the complexity and challenge of these topics in the medical field, requiring more in-depth reasoning and understanding of complex issues [<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>Although chatbots are not yet able to provide perfect answers in all aspects of medicine, chatbots have great potential for answering medical questions. Because the performance of chatbots is not static, with the continuous enrichment of training data and the continuous improvement and optimization of algorithms, chatbot performance will continue to improve. The results of this cross-sectional study have implications for both users and developers of chatbot systems. To improve the quality and efficiency of chatbot use, users need to deeply understand the performance and characteristics of different models and carefully study the performance differences and evaluation results of different models on different topics or task types to select the appropriate model according to actual needs [<xref ref-type="bibr" rid="ref24">24</xref>]. Moreover, users should set clear expectations and establish appropriate evaluation criteria based on the model's actual performance to conduct objective and comprehensive evaluations in practical applications. For complex tasks or tasks requiring high accuracy, users can attempt to combine the advantages of multiple models, adopt the method of ensemble learning, and weight the prediction results of multiple models to obtain more accurate prediction results. Specific feedback and suggestions are provided to developers in appropriate ways and channels when the model is used. 
For developers, the results of the study can be used to optimize the model for performance issues on a particular topic or task. For example, training data in related fields can be increased, or more advanced training algorithms can be used to improve the performance of the model in the medical field. The user feedback and suggestions are actively collected, and the model is constantly iterated and improved. Everyone needs to recognize that no chatbot is omnipotent. We need to understand it, use it correctly rather than contradict or abuse it, treat it as a judge, let it grow healthily, and create more and better value for human beings.</p></sec><sec id="s4-2"><title>Limitations</title><p>Our study has several limitations. First, the original language of all the questions in this cross-sectional study was Chinese, and the questions were asked in Chinese or translated from Chinese to English, which may not represent the performance of the model in other languages. Second, the sample size of our study was not large and focused only on text-related topics related to ultrasound medicine. Experts also have potential bias in the scoring. The universality of the results of this study in other medical specialties needs further research. Finally, chatbot versions are constantly being updated, and the results of this study may not be representative of the performance of the most recent version of the model at the time of publication [<xref ref-type="bibr" rid="ref41">41</xref>]. 
However, GPT-3.5 and Bot-3.5 are the best-performing of the current free versions and have the largest numbers of users, and it is meaningful to study the performance comparison of the latest free versions of each model over the same period.</p></sec><sec id="s4-3"><title>Conclusions</title><p>The results show that the AI chatbots represented by GPT-3.5 have a certain ability to answer questions in ultrasound medical examination papers, but there are varying degrees of performance differences across chatbot models, input languages, question types, and topics. For the Chinese ultrasound medical questions in this study, Bot-3.5 was superior to GPT-3.5 in terms of both accuracy and quality in many aspects. These findings suggest that users need to thoroughly understand the performance characteristics of various models and their applicability to different types of problems. For complex problems, multiple models are needed for comprehensive analysis. This finding also suggests that developers need to continuously optimize models to enrich training data, especially in medical specialties. In this way, chatbots can be continuously optimized, their performance consistently improved, and their ability to provide high-quality services enhanced.</p></sec></sec></body><back><ack><p>The authors would like to acknowledge the assistance provided by Dr Lu from West China Hospital for data collection and validation. This research was supported by the 1.3.5 project for disciplines of excellence, West China Hospital, Sichuan University.</p></ack><notes><sec><title>Data Availability</title><p>The data presented in this study are available upon request from the corresponding author. 
The data are not publicly available due to privacy restrictions.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">Bot-3.5</term><def><p>ERNIE Bot-3.5</p></def></def-item><def-item><term id="abb3">GPT-3.5</term><def><p>GPT-3.5 Turbo</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deiana</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dettori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Arghittu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Azara</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gabutti</surname><given-names>G</given-names> </name><name name-style="western"><surname>Castiglia</surname><given-names>P</given-names> </name></person-group><article-title>Artificial intelligence and public health: evaluating ChatGPT responses to vaccination myths and misconceptions</article-title><source>Vaccines (Basel)</source><year>2023</year><month>07</month><day>7</day><volume>11</volume><issue>7</issue><fpage>1217</fpage><pub-id pub-id-type="doi">10.3390/vaccines11071217</pub-id><pub-id pub-id-type="medline">37515033</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Wollenberg</surname><given-names>B</given-names> </name><name name-style="western"><surname>L&#x00FC;ers</surname><given-names>JC</given-names> 
</name><etal/></person-group><article-title>ChatGPT&#x2019;s quiz skills in different otolaryngology subspecialties: an analysis of 2576 single-choice and multiple-choice board certification preparation questions</article-title><source>Eur Arch Otorhinolaryngol</source><year>2023</year><month>09</month><volume>280</volume><issue>9</issue><fpage>4271</fpage><lpage>4278</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08051-4</pub-id><pub-id pub-id-type="medline">37285018</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mihalache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Popovic</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Muni</surname><given-names>RH</given-names> </name></person-group><article-title>Performance of an artificial intelligence chatbot in ophthalmic knowledge assessment</article-title><source>JAMA Ophthalmol</source><year>2023</year><month>06</month><day>1</day><volume>141</volume><issue>6</issue><fpage>589</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.1144</pub-id><pub-id pub-id-type="medline">37103928</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cakir</surname><given-names>H</given-names> </name><name name-style="western"><surname>Caglar</surname><given-names>U</given-names> </name><name name-style="western"><surname>Yildiz</surname><given-names>O</given-names> </name><name name-style="western"><surname>Meric</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ayranci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ozgor</surname><given-names>F</given-names> </name></person-group><article-title>Evaluating the performance of 
ChatGPT in answering questions related to urolithiasis</article-title><source>Int Urol Nephrol</source><year>2024</year><month>01</month><volume>56</volume><issue>1</issue><fpage>17</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1007/s11255-023-03773-0</pub-id><pub-id pub-id-type="medline">37658948</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luykx</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Gerritse</surname><given-names>F</given-names> </name><name name-style="western"><surname>Habets</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Vinkers</surname><given-names>CH</given-names> </name></person-group><article-title>The performance of ChatGPT in generating answers to clinical questions in psychiatry: a two-layer assessment</article-title><source>World Psychiatry</source><year>2023</year><month>10</month><volume>22</volume><issue>3</issue><fpage>479</fpage><lpage>480</lpage><pub-id pub-id-type="doi">10.1002/wps.21145</pub-id><pub-id pub-id-type="medline">37713576</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Samaan</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Yeo</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Rajeev</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Assessing the accuracy of responses by the language model ChatGPT to questions regarding bariatric surgery</article-title><source>Obes Surg</source><year>2023</year><month>06</month><volume>33</volume><issue>6</issue><fpage>1790</fpage><lpage>1796</lpage><pub-id pub-id-type="doi">10.1007/s11695-023-06603-5</pub-id><pub-id 
pub-id-type="medline">37106269</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tariq</surname><given-names>R</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khanna</surname><given-names>S</given-names> </name></person-group><article-title>Evolving landscape of large language models: an evaluation of ChatGPT and Bard in answering patient queries on colonoscopy</article-title><source>Gastroenterology</source><year>2024</year><month>01</month><volume>166</volume><issue>1</issue><fpage>220</fpage><lpage>221</lpage><pub-id pub-id-type="doi">10.1053/j.gastro.2023.08.033</pub-id><pub-id pub-id-type="medline">37634736</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Danesh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pazouki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Danesh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Danesh</surname><given-names>F</given-names> </name><name name-style="western"><surname>Danesh</surname><given-names>A</given-names> </name></person-group><article-title>The performance of artificial intelligence language models in board-style dental knowledge assessment: a preliminary study on ChatGPT</article-title><source>J Am Dent Assoc</source><year>2023</year><month>11</month><volume>154</volume><issue>11</issue><fpage>970</fpage><lpage>974</lpage><pub-id pub-id-type="doi">10.1016/j.adaj.2023.07.016</pub-id><pub-id pub-id-type="medline">37676187</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Walker</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Ghani</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kuemmerli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Reliability of medical information provided by ChatGPT: assessment against clinical guidelines and patient Information quality instrument</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>30</day><volume>25</volume><fpage>e47479</fpage><pub-id pub-id-type="doi">10.2196/47479</pub-id><pub-id pub-id-type="medline">37389908</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chervenak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lieman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Blanco-Breindel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jindal</surname><given-names>S</given-names> </name></person-group><article-title>The promise and peril of using a large language model to obtain clinical information: ChatGPT performs strongly as a fertility counseling tool with limitations</article-title><source>Fertil Steril</source><year>2023</year><month>09</month><volume>120</volume><issue>3 Pt 2</issue><fpage>575</fpage><lpage>583</lpage><pub-id pub-id-type="doi">10.1016/j.fertnstert.2023.05.151</pub-id><pub-id pub-id-type="medline">37217092</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kao</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Chien</surname><given-names>TW</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Chou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>JC</given-names> </name></person-group><article-title>Assessing ChatGPT&#x2019;s capacity for clinical decision support in pediatrics: A comparative study with pediatricians using KIDMAP of Rasch analysis</article-title><source>Medicine (Balt)</source><year>2023</year><month>06</month><day>23</day><volume>102</volume><issue>25</issue><fpage>e34068</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000034068</pub-id><pub-id pub-id-type="medline">37352054</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cocci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pezzoli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lo Re</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Quality of information and appropriateness of ChatGPT outputs for urology patients</article-title><source>Prostate Cancer Prostatic Dis</source><year>2024</year><month>03</month><volume>27</volume><issue>1</issue><fpage>103</fpage><lpage>108</lpage><pub-id pub-id-type="doi">10.1038/s41391-023-00705-y</pub-id><pub-id pub-id-type="medline">37516804</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yeo</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Samaan</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>WH</given-names> </name><etal/></person-group><article-title>Assessing the performance of ChatGPT in answering questions regarding cirrhosis and 
hepatocellular carcinoma</article-title><source>Clin Mol Hepatol</source><year>2023</year><month>07</month><volume>29</volume><issue>3</issue><fpage>721</fpage><lpage>732</lpage><pub-id pub-id-type="doi">10.3350/cmh.2023.0089</pub-id><pub-id pub-id-type="medline">36946005</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Branum</surname><given-names>C</given-names> </name><name name-style="western"><surname>Schiavenato</surname><given-names>M</given-names> </name></person-group><article-title>Can ChatGPT accurately answer a PICOT question? Assessing AI response to a clinical question</article-title><source>Nurse Educ</source><year>2023</year><volume>48</volume><issue>5</issue><fpage>231</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1097/NNE.0000000000001436</pub-id><pub-id pub-id-type="medline">37130197</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gencer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aydin</surname><given-names>S</given-names> </name></person-group><article-title>Can ChatGPT pass the thoracic surgery exam?</article-title><source>Am J Med Sci</source><year>2023</year><month>10</month><volume>366</volume><issue>4</issue><fpage>291</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1016/j.amjms.2023.08.001</pub-id><pub-id pub-id-type="medline">37549788</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Samaan</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Yeo</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>WH</given-names> 
</name><etal/></person-group><article-title>ChatGPT&#x2019;s ability to comprehend and answer cirrhosis related questions in Arabic</article-title><source>Arab J Gastroenterol</source><year>2023</year><month>08</month><volume>24</volume><issue>3</issue><fpage>145</fpage><lpage>148</lpage><pub-id pub-id-type="doi">10.1016/j.ajg.2023.08.001</pub-id><pub-id pub-id-type="medline">37673708</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alter</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lessans</surname><given-names>N</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Brezinov</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Levin</surname><given-names>G</given-names> </name></person-group><article-title>Performance of ChatGPT in Israeli Hebrew OBGYN national residency examinations</article-title><source>Arch Gynecol Obstet</source><year>2023</year><month>12</month><volume>308</volume><issue>6</issue><fpage>1797</fpage><lpage>1802</lpage><pub-id pub-id-type="doi">10.1007/s00404-023-07185-4</pub-id><pub-id pub-id-type="medline">37668790</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fergus</surname><given-names>S</given-names> </name><name name-style="western"><surname>Botha</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ostovar</surname><given-names>M</given-names> </name></person-group><article-title>Evaluating academic answers generated using ChatGPT</article-title><source>J Chem 
Educ</source><year>2023</year><month>04</month><day>11</day><volume>100</volume><issue>4</issue><fpage>1672</fpage><lpage>1675</lpage><pub-id pub-id-type="doi">10.1021/acs.jchemed.3c00087</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Friederichs</surname><given-names>H</given-names> </name><name name-style="western"><surname>Friederichs</surname><given-names>WJ</given-names> </name><name name-style="western"><surname>M&#x00E4;rz</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT in medical school: how successful is AI in progress testing?</article-title><source>Med Educ Online</source><year>2023</year><month>12</month><volume>28</volume><issue>1</issue><fpage>2220920</fpage><pub-id pub-id-type="doi">10.1080/10872981.2023.2220920</pub-id><pub-id pub-id-type="medline">37307503</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aburumman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Al Annan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mrad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Brunaldi</surname><given-names>VO</given-names> </name><name name-style="western"><surname>Gala</surname><given-names>K</given-names> </name><name name-style="western"><surname>Abu Dayyeh</surname><given-names>BK</given-names> </name></person-group><article-title>Assessing ChatGPT vs. 
standard medical resources for endoscopic sleeve gastroplasty education: a medical professional evaluation study</article-title><source>Obes Surg</source><year>2024</year><month>07</month><volume>34</volume><issue>7</issue><fpage>2718</fpage><lpage>2724</lpage><pub-id pub-id-type="doi">10.1007/s11695-024-07283-5</pub-id><pub-id pub-id-type="medline">38758515</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deebel</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Terlecki</surname><given-names>R</given-names> </name></person-group><article-title>ChatGPT performance on the American Urological Association Self-Assessment Study Program and the potential influence of artificial intelligence in urologic training</article-title><source>Urology</source><year>2023</year><month>07</month><volume>177</volume><fpage>29</fpage><lpage>33</lpage><pub-id pub-id-type="doi">10.1016/j.urology.2023.05.010</pub-id><pub-id pub-id-type="medline">37209880</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Humar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Asaad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bengur</surname><given-names>FB</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>V</given-names> </name></person-group><article-title>ChatGPT is equivalent to first-year plastic surgery residents: evaluation of ChatGPT on the plastic surgery in-service examination</article-title><source>Aesthet Surg J</source><year>2023</year><month>11</month><day>16</day><volume>43</volume><issue>12</issue><fpage>1085</fpage><lpage>NP1089</lpage><pub-id pub-id-type="doi">10.1093/asj/sjad130</pub-id><pub-id 
pub-id-type="medline">37140001</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Applying large language models in teaching business English writing: a case study of business proposal writing</article-title><source>SHS Web Conf</source><year>2024</year><volume>181</volume><fpage>01052</fpage><pub-id pub-id-type="doi">10.1051/shsconf/202418101052</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Q</given-names> </name></person-group><article-title>Physician versus large language model chatbot responses to web-based questions from autistic patients in Chinese: cross-sectional comparative analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>30</day><volume>26</volume><fpage>e54706</fpage><pub-id pub-id-type="doi">10.2196/54706</pub-id><pub-id pub-id-type="medline">38687566</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>P</given-names> </name></person-group><article-title>Language and cultural bias in AI: comparing the performance of large language models developed in different countries on Traditional Chinese Medicine highlights the need for localized models</article-title><source>J Transl Med</source><year>2024</year><month>03</month><day>29</day><volume>22</volume><issue>1</issue><fpage>319</fpage><pub-id pub-id-type="doi">10.1186/s12967-024-05128-4</pub-id><pub-id pub-id-type="medline">38553705</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biswas</surname><given-names>S</given-names> </name><name name-style="western"><surname>Logan</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Davies</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Sheppard</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Wolffsohn</surname><given-names>JS</given-names> </name></person-group><article-title>Assessing the utility of ChatGPT as an artificial intelligence&#x2010;based large language model for information to answer questions on myopia</article-title><source>Ophthalmic Physiol Opt</source><year>2023</year><month>11</month><volume>43</volume><issue>6</issue><fpage>1562</fpage><lpage>1570</lpage><pub-id pub-id-type="doi">10.1111/opo.13207</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Madrid-Garc&#x00ED;a</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rosales-Rosado</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Freites-Nu&#x00F1;ez</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Harnessing ChatGPT and GPT-4 for evaluating the rheumatology questions of the Spanish access exam to specialized medical training</article-title><source>Sci Rep</source><year>2023</year><volume>13</volume><issue>1</issue><fpage>22129</fpage><pub-id pub-id-type="doi">10.1101/2023.07.21.23292821</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Potapenko</surname><given-names>I</given-names> </name><name name-style="western"><surname>Malmqvist</surname><given-names>L</given-names> </name><name name-style="western"><surname>Subhi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hamann</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence-based ChatGPT responses for patient questions on Optic Disc Drusen</article-title><source>Ophthalmol Ther</source><year>2023</year><month>12</month><volume>12</volume><issue>6</issue><fpage>3109</fpage><lpage>3119</lpage><pub-id pub-id-type="doi">10.1007/s40123-023-00800-2</pub-id><pub-id pub-id-type="medline">37698823</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haver</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Ambinder</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Bahl</surname><given-names>M</given-names> </name><name name-style="western"><surname>Oluyemi</surname><given-names>ET</given-names> 
</name><name name-style="western"><surname>Jeudy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>PH</given-names> </name></person-group><article-title>Appropriateness of breast cancer prevention and screening recommendations provided by ChatGPT</article-title><source>Radiology</source><year>2023</year><month>05</month><volume>307</volume><issue>4</issue><fpage>e230424</fpage><pub-id pub-id-type="doi">10.1148/radiol.230424</pub-id><pub-id pub-id-type="medline">37014239</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kusunose</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kashima</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sata</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the accuracy of ChatGPT in answering clinical questions on the Japanese Society of Hypertension Guidelines</article-title><source>Circ J</source><year>2023</year><month>06</month><day>23</day><volume>87</volume><issue>7</issue><fpage>1030</fpage><lpage>1033</lpage><pub-id pub-id-type="doi">10.1253/circj.CJ-23-0308</pub-id><pub-id pub-id-type="medline">37286486</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>D</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name></person-group><article-title>ChatGPT&#x2019;s responses to gout-related questions</article-title><source>Asian J 
Surg</source><year>2023</year><month>12</month><volume>46</volume><issue>12</issue><fpage>5935</fpage><lpage>5936</lpage><pub-id pub-id-type="doi">10.1016/j.asjsur.2023.08.217</pub-id><pub-id pub-id-type="medline">37696697</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>UH</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Hsu</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Kan</surname><given-names>JKC</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT-4 on the United Kingdom Medical Licensing Assessment</article-title><source>Front Med</source><year>2023</year><volume>10</volume><fpage>1240915</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1240915</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yun</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>EK</given-names> </name></person-group><article-title>A comprehensive evaluation of ChatGPT consultation quality for augmentation mammoplasty: A comparative analysis between plastic surgeons and laypersons</article-title><source>Int J Med Inform</source><year>2023</year><month>11</month><volume>179</volume><fpage>105219</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105219</pub-id><pub-id pub-id-type="medline">37776670</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>N</given-names> </name><name name-style="western"><surname>Longjam</surname><given-names>LA</given-names> </name><etal/></person-group><article-title>Assessing the capability of ChatGPT in answering first- and second-order knowledge questions on microbiology as per competency-based medical education curriculum</article-title><source>Cureus</source><year>2023</year><month>03</month><volume>15</volume><issue>3</issue><fpage>e36034</fpage><pub-id pub-id-type="doi">10.7759/cureus.36034</pub-id><pub-id pub-id-type="medline">37056538</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alessandri Bonetti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Giorgino</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gallo Afflitto</surname><given-names>G</given-names> </name><name name-style="western"><surname>De Lorenzi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Egro</surname><given-names>FM</given-names> </name></person-group><article-title>How does ChatGPT perform on the Italian Residency Admission National Exam compared to 15,869 medical graduates?</article-title><source>Ann Biomed Eng</source><year>2024</year><month>04</month><volume>52</volume><issue>4</issue><fpage>745</fpage><lpage>749</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03318-7</pub-id><pub-id pub-id-type="medline">37490183</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gupta</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Herzog</surname><given-names>I</given-names> </name><name name-style="western"><surname>Park</surname><given-names>JB</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on the Plastic Surgery Inservice Training Examination</article-title><source>Aesthet Surg J</source><year>2023</year><month>11</month><day>16</day><volume>43</volume><issue>12</issue><fpage>NP1078</fpage><lpage>NP1082</lpage><pub-id pub-id-type="doi">10.1093/asj/sjad128</pub-id><pub-id pub-id-type="medline">37128784</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oztermeli</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Oztermeli</surname><given-names>A</given-names> </name></person-group><article-title>ChatGPT performance in the medical specialty exam: An observational study</article-title><source>Medicine (Baltimore)</source><year>2023</year><month>08</month><day>11</day><volume>102</volume><issue>32</issue><fpage>e34673</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000034673</pub-id><pub-id pub-id-type="medline">37565917</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Javid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bhandari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Parameshwari</surname><given-names>P</given-names> </name><name name-style="western"><surname>Reddiboina</surname><given-names>M</given-names> </name><name name-style="western"><surname>Prasad</surname><given-names>S</given-names> </name></person-group><article-title>Evaluation of ChatGPT for patient counseling in kidney stone clinic: a prospective study</article-title><source>J 
Endourol</source><year>2024</year><month>04</month><volume>38</volume><issue>4</issue><fpage>377</fpage><lpage>383</lpage><pub-id pub-id-type="doi">10.1089/end.2023.0571</pub-id><pub-id pub-id-type="medline">38411835</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laukamp</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Terzis</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Werner</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Monitoring patients with glioblastoma by using a large language model: accurate summarization of radiology reports with GPT-4</article-title><source>Radiology</source><year>2024</year><month>07</month><volume>312</volume><issue>1</issue><fpage>e232640</fpage><pub-id pub-id-type="doi">10.1148/radiol.232640</pub-id><pub-id pub-id-type="medline">39041936</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tian Tran</surname><given-names>J</given-names> </name><name name-style="western"><surname>Burghall</surname><given-names>A</given-names> </name><name name-style="western"><surname>Blydt-Hansen</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Exploring the ability of ChatGPT to create quality patient education resources about kidney transplant</article-title><source>Patient Educ Couns</source><year>2024</year><month>12</month><volume>129</volume><fpage>108400</fpage><pub-id pub-id-type="doi">10.1016/j.pec.2024.108400</pub-id><pub-id pub-id-type="medline">39232336</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref></ref-list></back></article>