<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e68527</article-id><article-id pub-id-type="doi">10.2196/68527</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>The Advanced Reasoning Capabilities of Large Language Models for Detecting Contraindicated Options in Medical Exams</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yano</surname><given-names>Yuichiro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ohashi</surname><given-names>Mizuki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Miyagami</surname><given-names>Taiju</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Mori</surname><given-names>Hirotake</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nishizaki</surname><given-names>Yuji</given-names></name><degrees>MD, MPH, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Daida</surname><given-names>Hiroyuki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Naito</surname><given-names>Toshio</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of General Medicine, Juntendo University Faculty of Medicine</institution><addr-line>2-1-1, Hongo, Bunkyo-Ku</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>AI Incubation Farm, Juntendo University Faculty of Medicine</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Division of Medical Education, Juntendo University School of Medicine</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Department of Cardiovascular Biology and Medicine, Juntendo University Graduate School of Medicine</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ma</surname><given-names>Chunwei</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Azahar</surname><given-names>Nazar</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yuichiro Yano, MD, PhD, Department of General Medicine, Juntendo University Faculty of Medicine, 2-1-1, Hongo, Bunkyo-Ku, Tokyo, 113-8421, Japan, 81 3-3813-3111; <email>yano.yuichiro@jichi.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>12</day><month>5</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e68527</elocation-id><history><date date-type="received"><day>09</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>22</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>25</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Yuichiro Yano, Mizuki Ohashi, Taiju Miyagami, Hirotake Mori, Yuji Nishizaki, Hiroyuki Daida, Toshio Naito. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 12.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e68527"/><abstract><p>Enhancing clinical reasoning and reducing diagnostic errors are essential in medical practice; OpenAI-o1, with advanced reasoning capabilities, performed better than GPT-4 on 15 Japanese National Medical Licensing Examination questions (accuracy: 100% vs 80%; contraindicated option detection: 87% vs 73%), though findings are preliminary due to the small sample size.</p></abstract><kwd-group><kwd>natural language processing</kwd><kwd>artificial intelligence</kwd><kwd>clinical reasoning</kwd><kwd>medical errors</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Diagnostic errors account for more than 8% of adverse medical events and up to 30% of malpractice claims [<xref ref-type="bibr" rid="ref1">1</xref>]. Enhancing clinical reasoning could mitigate this [<xref ref-type="bibr" rid="ref2">2</xref>], improving patient outcomes and potentially lowering legal liabilities. In September 2024, OpenAI introduced OpenAI-o1, a large language model (LLM) trained with reinforcement learning to enhance its complex &#x201C;reasoning&#x201D; [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Key enhancements include advanced attention mechanisms, refined training data and curation, and enhanced fine-tuning protocols [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
However, it remains uncertain whether OpenAI-o1 can improve clinical reasoning and reduce diagnostic errors.</p><p>In the Japanese National Medical Licensing Examination (JNMLE), candidates must not only achieve high overall accuracy but also avoid selecting contraindicated options&#x2014;errors that can lead to failure even if most answers are correct. Although prior studies indicate that ChatGPT-4 performs well on the JNMLE, it sometimes chooses contraindicated options [<xref ref-type="bibr" rid="ref6">6</xref>]. We posited that OpenAI-o1 would exhibit superior reasoning compared to GPT-4 and hypothesized that it would more proficiently avoid contraindicated options.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>On October 10, 2024, we used 15 text-based JNMLE questions (from 2019 to 2024) that included contraindicated options (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Questions with images were excluded due to OpenAI-o1&#x2019;s inability to process visual data. We administered the questions to both GPT-4 and OpenAI-o1, with each model evaluated under the supervision of designated examiners (MO and TM).</p><p>The examination comprised 3 steps: (1) Japanese examination&#x2014;select correct answers, (2) Japanese examination&#x2014;identify contraindicated options, and (3) English examination&#x2014;repeat steps 1 and 2 with translated questions. Translation used an automated system and was reviewed by bilingual clinical expert YY.</p><p>The responses from both models were recorded, and the results were evaluated based on the numbers of correct answers and correctly identified contraindicated options in both languages.</p></sec><sec id="s3" sec-type="results"><title>Results</title><p>As shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, among the 15 questions, GPT-4 correctly answered 12 (80%) and identified 11 contraindicated options (73%) in Japanese. 
In English, GPT-4 correctly answered 13 questions (87%) and identified 11 contraindicated options (73%). In Japanese, OpenAI-o1 correctly answered 15 questions (100%) and identified 13 contraindicated options (87%). Both GPT-4 and OpenAI-o1 had consistently equal or better performance in English than Japanese, especially for contraindicated options.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>OpenAI-o1 had higher accuracy than GPT-4 and was better able to identify contraindicated options on the JNMLE, particularly in English. However, this difference was minimal&#x2014;only 1 of 15 questions showed improvement in English&#x2014;indicating that language had little overall impact.</p><p>In medicine, avoiding contraindicated actions is crucial. While correct answers reflect basic medical knowledge, recognizing what should not be done requires advanced critical thinking and reasoning. Errors can lead to patient harm, lawsuits, or even license revocation. Here, OpenAI-o1 outperformed GPT-4 in identifying contraindicated actions. OpenAI-o1&#x2019;s enhancements [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] and our finding of its superior reasoning ability suggest the importance of using LLMs with robust reasoning capabilities for medical licensing examinations and, by extension, in clinical practice, to safeguard patient safety and uphold high standards of care.</p><p>Our study is limited, first, by using only 15 questions, so these findings should be interpreted as preliminary and hypothesis-generating. Second, we used the models&#x2019; default settings without fine-tuning, prompt engineering, or chain-of-thought modifications, capturing their performance at only a specific time point. Third, we obtained a single response per query, which may not reflect the full variability of LLM outputs. Fourth, continuous model updates limit exact reproducibility. 
Fifth, only 2 of 15 questions showed discrepancies, limiting our ability to analyze performance trends across question types (eg, clinical scenarios, complexity, and format). Sixth, we focused on comparing OpenAI-o1 and GPT-4 and excluded human performance benchmarks (eg, from medical students) due to the study&#x2019;s rapid initiation in October 2024, immediately following the release of OpenAI-o1. Given GPT-4&#x2019;s extensive dataset training and OpenAI-o1&#x2019;s enhanced reasoning capabilities, our primary objective was to promptly assess their differences in a medical context; frequent updates to LLMs and the time required for ethics approval and participant recruitment precluded human comparisons. Future research should integrate such comparisons. Lastly, we did not statistically evaluate the significance of the observed performance differences, further limiting our findings&#x2019; interpretability. The &#x201C;black box&#x201D; nature of both OpenAI-o1 and GPT-4 also limits interpretability; future research should use methods like attention analysis and causal reasoning tests and compare these models with open-source alternatives (eg, DeepSeek, Qwen) to enhance reproducibility and transparency.</p><p>The improved reasoning abilities of OpenAI-o1 may hold promise for real-world clinical applications. 
However, these findings are preliminary, and further research is needed to determine whether integrating such models into decision-support systems can contribute to reducing errors and enhancing patient care.</p></sec></body><back><ack><p>This research was partially funded by the Advanced Medical Personnel Training Program (principal investigator: TN) and was supported by the Ministry of Education, Culture, Sports, Science, and Technology.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">JNMLE</term><def><p>Japanese National Medical Licensing Examination</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berner</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Graber</surname><given-names>ML</given-names> </name></person-group><article-title>Overconfidence as a cause of diagnostic error in medicine</article-title><source>Am J Med</source><year>2008</year><month>05</month><volume>121</volume><issue>5 Suppl</issue><fpage>S2</fpage><lpage>S23</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2008.01.001</pub-id><pub-id pub-id-type="medline">18440350</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bowen</surname><given-names>JL</given-names> </name></person-group><article-title>Educational strategies to promote clinical diagnostic reasoning</article-title><source>N Engl J 
Med</source><year>2006</year><month>11</month><day>23</day><volume>355</volume><issue>21</issue><fpage>2217</fpage><lpage>2225</lpage><pub-id pub-id-type="doi">10.1056/NEJMra054782</pub-id><pub-id pub-id-type="medline">17124019</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Learning to reason with LLMs</article-title><source>OpenAI</source><year>2024</year><month>09</month><day>12</day><access-date>2025-03-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/learning-to-reason-with-llms/">https://openai.com/index/learning-to-reason-with-llms/</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zelikman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Goodman</surname><given-names>ND</given-names> </name></person-group><article-title>STaR: bootstrapping reasoning with reasoning</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 28, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.14465</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Temsah</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Jamal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Temsah</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Malki</surname><given-names>KH</given-names> </name></person-group><article-title>OpenAI 
o1-preview vs. ChatGPT in healthcare: a new frontier in medical AI reasoning</article-title><source>Cureus</source><year>2024</year><month>10</month><volume>16</volume><issue>10</issue><fpage>e70640</fpage><pub-id pub-id-type="doi">10.7759/cureus.70640</pub-id><pub-id pub-id-type="medline">39359332</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Kasai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kasai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Radev</surname><given-names>D</given-names> </name></person-group><article-title>Evaluating GPT-4 and ChatGPT on Japanese medical licensing examinations</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18027</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Responses of GPT-4 and OpenAI-o1 to Japanese National Medical Licensing Examination questions.</p><media xlink:href="medinform_v13i1e68527_app1.docx" xlink:title="DOCX File, 37 KB"/></supplementary-material></app-group></back></article>