<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e76773</article-id><article-id pub-id-type="doi">10.2196/76773</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Toward Cross-Hospital Deployment of Natural Language Processing Systems: Model Development and Validation of Fine-Tuned Large Language Models for Disease Name Recognition in Japanese</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Shimizu</surname><given-names>Seiji</given-names></name><degrees>MEng</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nishiyama</surname><given-names>Tomohiro</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nagai</surname><given-names>Hiroyuki</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wakamiya</surname><given-names>Shoko</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Aramaki</surname><given-names>Eiji</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Nara Institute of Science and Technology</institution><addr-line>8916-5, Takayama-cho</addr-line><addr-line>Ikoma-shi, Nara</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Klann</surname><given-names>Jeffrey</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Singh</surname><given-names>Ayush</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Siino</surname><given-names>Marco</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Eiji Aramaki, PhD, Nara Institute of Science and Technology, 8916-5, Takayama-cho, Ikoma-shi, Nara, 630-0192, Japan, 81 743-72-5250; <email>aramaki@is.naist.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>8</day><month>7</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e76773</elocation-id><history><date date-type="received"><day>30</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>10</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>11</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Seiji Shimizu, Tomohiro Nishiyama, Hiroyuki Nagai, Shoko Wakamiya, Eiji Aramaki. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 8.7.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e76773"/><abstract><sec><title>Background</title><p>Disease name recognition is a fundamental task in clinical natural language processing, enabling the extraction of critical patient information from electronic health records. While recent advances in large language models (LLMs) have shown promise, most evaluations have focused on English, and little is known about their robustness in low-resource languages such as Japanese. 
In particular, whether these models can perform reliably on previously unseen in-hospital data, which differs from training data in writing styles and clinical contexts, has not been thoroughly investigated.</p></sec><sec><title>Objective</title><p>This study evaluated the robustness of fine-tuned LLMs for disease name recognition in Japanese clinical notes, with a particular focus on their performance on in-hospital data that was not included during training.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used two corpora for this study: (1) a publicly available set of Japanese case reports denoted as CR, and (2) a newly constructed corpus of progress notes, denoted as PN, written by ten physicians to capture stylistic variations of in-hospital clinical notes. To reflect real-world deployment scenarios, we first fine-tuned models on CR. Specifically, we compared an LLM and a baseline masked language model (MLM). These models were then evaluated under two conditions: (1) on CR, representing the in-domain (ID) setting with the same document type, similar to training, and (2) on PN, representing the out-of-domain (OOD) setting with a different document type. Robustness was assessed by calculating the performance gap (ie, the performance drop from in-domain to out-of-domain settings).</p></sec><sec sec-type="results"><title>Results</title><p>The LLM demonstrated greater robustness, with a smaller performance gap in <italic>F</italic><sub>1</sub>-scores (ID&#x2013;OOD = &#x2212;8.6) compared to the MLM baseline performance (ID&#x2013;OOD = &#x2212;13.9). This indicated more stable performance across ID and OOD settings, highlighting the effectiveness of fine-tuned LLMs for reliable use in diverse clinical settings.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Fine-tuned LLMs demonstrate superior robustness for disease name recognition in Japanese clinical notes, with a smaller performance gap. 
These findings highlight the potential of LLMs as reliable tools for clinical natural language processing in low-resource language settings and support their deployment in real-world health care applications, where diversity in documentation is inevitable.</p></sec></abstract><kwd-group><kwd>clinical NLP</kwd><kwd>Japanese language</kwd><kwd>named entity recognition</kwd><kwd>large language models</kwd><kwd>out-of-domain robustness</kwd><kwd>clinical corpus</kwd><kwd>clinical natural language processing</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Clinical notes contain a vast amount of information that is not captured in structured fields of electronic health records (EHRs) [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Natural language processing (NLP) techniques have become essential for unlocking this rich, unstructured data [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Among these, named entity recognition (NER)&#x2014;a task that identifies clinical entities such as disease names in text&#x2014;plays a vital role in extracting key clinical information, which is essential for understanding patients&#x2019; medical conditions [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. For instance, disease name recognition can be leveraged to detect adverse drug reactions from EHRs for post-marketing surveillance [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Recent advances leveraging fine-tuned masked language models (MLMs) such as BERT, have achieved state-of-the-art performance in clinical NER tasks, often outperforming prompt-based in-context learning (ICL) of large language models (LLMs) [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. 
However, MLMs fine-tuned for disease name recognition tend to experience notable performance drops on unseen in-hospital data [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Given that clinical NLP systems are expected to operate reliably across diverse clinical settings, understanding and improving robustness&#x2014;that is, whether models can perform reliably on previously unseen clinical notes&#x2014;is a critical research objective.</p><p>When fine-tuned, LLMs have shown competitive or slightly superior performance compared to MLMs in NER tasks [<xref ref-type="bibr" rid="ref12">12</xref>]. Given their exposure to a broader and more diverse range of linguistic patterns during pretraining, LLMs are expected to exhibit resilience to stylistic variations. However, the extent to which fine-tuning improves their robustness over MLMs remains underexplored, particularly in languages other than English [<xref ref-type="bibr" rid="ref13">13</xref>]. One reason for this research gap is the lack of corpora that reflect the realistic documentation styles of in-hospital clinical notes. These data are challenging to obtain due to privacy concerns and institutional restrictions.</p><p>In this study, we investigate the robustness of fine-tuned LLMs with a focus on disease name recognition in Japanese clinical notes. To facilitate an evaluation of robustness to unseen in-hospital data, we constructed a dedicated clinical corpus comprising progress notes (PN) authored by ten individual physicians from different clinical institutions, reflecting diverse in-hospital documentation styles. To reflect real-world deployment scenarios, we trained the models on publicly available case reports (CR) and evaluated them under two conditions: (1) on CR, representing the in-domain (ID) setting with the same document type as in training, and (2) on PN, representing the out-of-domain (OOD) setting with a different document type. 
This cross-document evaluation allows us to assess the models&#x2019; robustness to real-world variability in documentation styles, capturing the challenges introduced by the diverse writing practices found in in-hospital clinical notes.</p><p>Experimental results demonstrated that the fine-tuned LLM&#x2014;specifically LLaMA-3.1 [<xref ref-type="bibr" rid="ref14">14</xref>]&#x2014;outperforms the MLM baseline (Bidirectional Encoder Representations from Transformers, BERT) [<xref ref-type="bibr" rid="ref15">15</xref>], not only in the ID setting but also under OOD conditions. The LLM also exhibits a smaller performance gap between ID and OOD settings, indicating stronger robustness compared to the MLM. Further analysis reveals that LLMs are more resilient to stylistic diversity among clinicians, showing reduced performance fluctuation across different physicians. These findings underscore the potential of fine-tuned LLMs as more reliable tools for real-world clinical applications, particularly where robustness to diverse and previously unseen clinical notes is essential.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>Our primary research question was to evaluate whether fine-tuned LLMs remain robust when applied to previously unseen clinical notes. To address this, we compared the performance gap, defined as the performance difference between ID setting and OOD settings between a fine-tuned LLM and a fine-tuned MLM, which serves as a state-of-the-art baseline for clinical NER.</p><p>The overview of the evaluation pipeline is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. We evaluated model performance on the task of disease name recognition in Japanese clinical notes. Both LLMs and MLMs are first fine-tuned on a training set sampled from one document type (eg, case reports). 
Evaluation is then conducted on two distinct test sets: one ID set sampled from the same document type as the training data and one OOD set sampled from a different document type (eg, progress notes).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the evaluation pipeline. Models were fine-tuned on CR and evaluated on both CR, representing in-domain (ID) and PN, representing out-of-domain (OOD) test sets for disease name recognition in Japanese clinical notes. BERT: Bidirectional Encoder Representations from Transformers; HER2: human epidermal growth factor receptor 2; ICI: Immune checkpoint inhibitor; LLM: large language model; MLM: masked language model; SOX: SRY-related HMG-box.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76773_fig01.png"/></fig></sec><sec id="s2-2"><title>Materials</title><p>To represent ID and OOD settings, we used two datasets: an existing publicly available corpus of case reports and a newly constructed corpus composed of progress notes.</p><list list-type="order"><list-item><p>Case reports (CR): A publicly available dataset consisting of Japanese case reports comprising 1898 sentences across 148 documents annotated with clinical entities [<xref ref-type="bibr" rid="ref16">16</xref>].</p></list-item><list-item><p>Progress notes (PN): A dedicated, newly constructed corpus of progress notes, comprising 1094 sentences across 100 documents annotated with disease entities.</p></list-item></list><p>For the construction of PN, we first curated 10 diverse board exam-style cases. To reflect real-world clinical documentation, these cases were then rewritten by ten physicians, each contributing ten unique documents. The physicians were instructed to adapt the cases to authentic in-hospital clinical note styles from their clinical practice, emphasizing realistic writing styles and varying levels of readability. 
In total, 1094 sentences (100 documents) across 10 physicians and 10 clinical cases were created. The created PN were then annotated for disease name entities by two experienced annotators, following the same annotation guidelines as CR [<xref ref-type="bibr" rid="ref17">17</xref>]. We summarize the titles of the board exam-style cases and the top 3 most frequent disease entities per case in <xref ref-type="table" rid="table1">Table 1</xref>.</p><p>We assessed the annotation quality of PN by calculating interannotator agreement using two criteria: exact and partial span matching. Exact matching required both annotators to identify the same entity with identical span boundaries, while partial matching allowed for overlapping spans, acknowledging minor variations in boundary selection. Based on a comparison of annotations from 10 randomly sampled documents, the agreement between annotators was 0.70 (61/87) for exact matching and 0.82 (71/87) for partial matching, suggesting a high level of consistency and reliable annotation quality.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Titles of the curated board exam-style cases and top 3 frequent disease entities.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Title and top 3 disease entities</td><td align="left" valign="bottom">Frequency (n)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">&#x201C;&#x6025;&#x6027;&#x866B;&#x5782;&#x708E;&#x201D; (Acute appendicitis)</td></tr><tr><td align="left" valign="top">&#x201C;&#x6025;&#x6027;&#x866B;&#x5782;&#x708E;&#x201C; (Acute appendicitis)</td><td align="left" valign="top">11</td></tr><tr><td align="left" valign="top">&#x201C;&#x8179;&#x75DB;&#x201C; (Abdominal pain)</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top">&#x201C;&#x7A7F;&#x5B54;&#x3084;&#x81BF;&#x760D;&#x201C; (Perforation or abscess)</td><td align="char" char="." 
valign="top">5</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x9032;&#x884C;&#x80C3;&#x304C;&#x3093;&#x201C; (Advanced gastric cancer)</td></tr><tr><td align="left" valign="top">&#x201C;&#x9032;&#x884C;&#x80C3;&#x304C;&#x3093;&#x201C; (Advanced gastric cancer)</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x201C;&#x547C;&#x5438;&#x56F0;&#x96E3;&#x201C; (Dyspnea)</td><td align="char" char="." valign="top">7</td></tr><tr><td align="left" valign="top">&#x201C;&#x809D;&#x8EE2;&#x79FB;&#x201C; (Liver metastasis)</td><td align="char" char="." valign="top">5</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x809D;&#x81BF;&#x760D;&#x201C; (Liver abscess)</td></tr><tr><td align="left" valign="top">&#x201C;&#x809D;&#x81BF;&#x760D;&#x201C; (Liver abscess)</td><td align="left" valign="top">15</td></tr><tr><td align="left" valign="top">&#x201C;&#x81BF;&#x760D;&#x201C; (Abscess)</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x201C;&#x5026;&#x6020;&#x611F;&#x201C; (Fatigue)</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x6708;&#x7D4C;&#x904E;&#x591A;&#x306B;&#x4F34;&#x3046;&#x8CA7;&#x8840;&#x201C; (Anemia due to menorrhagia)</td></tr><tr><td align="left" valign="top">&#x201C;&#x5B50;&#x5BAE;&#x7B4B;&#x816B;&#x201C; (Uterine fibroids)</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">&#x201C;&#x8CA7;&#x8840;&#x201C; (Anemia)</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">&#x201C;&#x6708;&#x7D4C;&#x904E;&#x591A;&#x201C; (Menorrhagia)</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x80BA;&#x304C;&#x3093;&#x201C; (Lung cancer)</td></tr><tr><td align="left" valign="top">&#x201C;&#x809D;&#x8EE2;&#x79FB;&#x201C; (Liver metastasis)</td><td align="left" valign="top">8</td></tr><tr><td align="left" 
valign="top">&#x201C;&#x80BA;&#x5C0F;&#x7D30;&#x80DE;&#x764C;&#x201C; (Small cell lung cancer)</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x201C;&#x4F4E;Na&#x8840;&#x75C7;&#x201C; (Hyponatremia)</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;IE (&#x611F;&#x67D3;&#x6027;&#x5FC3;&#x5185;&#x819C;&#x708E;&#xFF09;&#x201C; (Infective endocarditis)</td></tr><tr><td align="left" valign="top">&#x201C;&#x304F;&#x3082;&#x819C;&#x4E0B;&#x51FA;&#x201C; (Subarachnoid hemorrhage)</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">SAH (Subarachnoid hemorrhage)</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">&#x201C;&#x75A3;&#x8D05;&#x201C; (Vegetation - cardiac)</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x8AA4;&#x56A5;&#x6027;&#x80BA;&#x708E;&#x201C;&#xFF0B;COPD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (Aspiration Pneumonia with COPD)</td></tr><tr><td align="left" valign="top">COPD</td><td align="char" char="." 
valign="top">12</td></tr><tr><td align="left" valign="top">&#x201C;&#x8AA4;&#x56A5;&#x6027;&#x80BA;&#x708E;&#x201C; (Aspiration pneumonia)</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">&#x201C;&#x6D78;&#x6F64;&#x5F71;&#x201C; (Infiltrates)</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x30AF;&#x30E2;&#x819C;&#x4E0B;&#x51FA;&#x8840;&#x201C; (Subarachnoid hemorrhage)</td></tr><tr><td align="left" valign="top">SAH (Subarachnoid hemorrhage)</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top">&#x201C;&#x982D;&#x75DB;&#x201C; (Headache)</td><td align="left" valign="top">9</td></tr><tr><td align="left" valign="top">&#x201C;&#x304F;&#x3082;&#x819C;&#x4E0B;&#x51FA;&#x8840;&#x201C; (Subarachnoid hemorrhage)</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top" colspan="2">&#x201C;&#x5927;&#x8178;&#x304C;&#x3093;&#x201C; (Colorectal cancer)</td></tr><tr><td align="left" valign="top">&#x201C;Is&#x30DD;&#x30EA;&#x30FC;&#x30D7;&#x201C; (Is polyp)</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">&#x201C;&#x5927;&#x8178;&#x304C;&#x3093;&#x201C; (Colorectal cancer)</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">Well-differentiated tubular adenocarcinoma in tubular adenoma</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top" colspan="2">AMI (acute myocardial infarction)</td></tr><tr><td align="left" valign="top">&#x201C;&#x80F8;&#x75DB;&#x201C; (Chest pain)</td><td align="left" valign="top">15</td></tr><tr><td align="left" valign="top">AMI</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x201C;&#x58C1;&#x904B;&#x52D5;&#x4F4E;&#x4E0B;&#x201C; (Wall motion abnormality)</td><td align="left" valign="top">5</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>COPD: chronic obstructive pulmonary 
disease</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Models and Baselines</title><p>We evaluate fine-tuned MLMs and LLMs on the task of disease name recognition in Japanese clinical notes, comparing their performance under ID and OOD settings. In addition, we evaluated LLMs using in-context learning (ICL) through zero-shot and few-shot prompting to assess the contribution of fine-tuning to improving LLM performance.</p><p>Fine-Tuning: We fine-tuned two models for the NER task: (1) &#x201C;bert-base-japanese-v3&#x201D; [<xref ref-type="bibr" rid="ref18">18</xref>], based on BERT [<xref ref-type="bibr" rid="ref15">15</xref>] as a strong MLM baseline, and (2) Swallow-Instruct-v0.2 [<xref ref-type="bibr" rid="ref19">19</xref>], based on LLaMA-3.1 (version 8B; Meta) [<xref ref-type="bibr" rid="ref14">14</xref>], which is a Japanese-instruction-tuned LLM. Both were subsequently fine-tuned on the ID training set. &#x201C;bert-base-japanese-v3&#x201D; was chosen as a strong and widely used baseline for Japanese-language tasks, providing a representative benchmark for traditional transformer&#x2013;based MLMs that had been validated in clinical NLP [<xref ref-type="bibr" rid="ref20">20</xref>]. In contrast, &#x201C;Swallow-Instruct-v0.2&#x201D; (version 0.2; Tokyo Institute of Technology) was selected to evaluate the potential of recent instruction&#x2013;tuned LLMs, which are designed to better follow task-specific instructions and generalize across diverse inputs. Built on Llama 3.1 8B through continual pretraining, it was trained on a curated instruction corpus featuring multiturn dialogue and multilingual tasks with a particular focus on enhancing Japanese language capabilities.</p><p>For BERT, we adopted a standard sequence labeling approach, using the &#x201C;CLS&#x201D; token representations followed by a linear classification layer to predict BIO-tagged labels for disease entities. 
In contrast, for LLaMA-3.1, we followed a generation-based NER framework: the model was prompted with an instruction and clinical note, and it generated the same sentence with inline entity tags, enabling span-level extraction in a natural language generation format. To adapt LLaMA-3.1 to the NER task efficiently, we applied additional fine-tuning using Low-Rank Adaptation (LoRA) [<xref ref-type="bibr" rid="ref21">21</xref>], allowing parameter-efficient fine-tuning without modifying the full set of model weights.</p><p>In-context learning (ICL): We evaluated LLaMA-3.1 and GPT-4o [<xref ref-type="bibr" rid="ref22">22</xref>] under zero-shot and few-shot settings as baselines. An example of a prompt used for ICL is presented in <xref ref-type="other" rid="box1">Textbox 1</xref>. Recently, extensive efforts have been made to optimize prompt design in the field of NLP [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. In this experiment, we provide the models with an annotation guideline that included entity definitions, task instructions, and illustrative examples, following prior work on medical NER [<xref ref-type="bibr" rid="ref9">9</xref>]. For zero-shot learning, the model was provided with task instructions and annotation guidelines only, without any annotated examples. For few-shot learning, a single annotated clinical note was randomly selected from the training data and included in the prompt. Here, LLaMA-3.1 represents an open-source LLM with accessible model weights, allowing evaluation across both fine-tuning and in-context learning scenarios. In contrast, GPT-4o was evaluated only in zero-shot and few-shot settings due to its proprietary nature, serving as a reference for the in-context learning performance of an LLM with the highest model capacity in our study.</p><boxed-text id="box1"><title> Textbox 1. 
Prompt example for ICL.</title><p>### Annotation Guideline</p><p>Definition of disease names</p><list list-type="order"><list-item><p>When the lesion or symptom has actually been observed in the patient</p></list-item><list-item><p>When it is suspected that the patient may have the lesion or symptom (eg, proposed as a differential diagnosis)</p></list-item><list-item><p>&#x2026;</p></list-item></list><p>### Examples</p><p>Text: In November last year, pleural effusion appeared and increased, but decreased after starting furosemide.</p><p>Annotation: In November last year, &#x003C;d&#x003E;pleural effusion&#x003C;/d&#x003E; appeared and increased, but decreased after starting furosemide.</p><p>&#x2026;</p><p>### Task Instruction</p><p>Based on the above explanation and examples, please annotate the following text</p><p>&#x2026;</p><p>Text: The nodule shadow in the right upper lobe of the lung slightly increased.</p></boxed-text></sec><sec id="s2-4"><title>Comparison Settings</title><p>We used our clinical corpora in a cross-domain evaluation setup to assess both ID and OOD robustness:</p><p>ID: Samples from one document type were split 8:2 into training and evaluation sets. This setting reflects standard model development conditions, where training and test data share similar clinical notes.</p><p>OOD: The corpus from another document type was used in its entirety for evaluation. This reflects real-world deployment scenarios where models must process previously unseen clinical notes with varying writing styles and vocabularies.</p><p>In addition to the setting where CR are used for training and PN for evaluation, we also included the reverse scenario. This resulted in four distinct evaluation configurations: CR&#x2192;CR, CR&#x2192;PN, PN&#x2192;PN, and PN&#x2192;CR. The difference between ID and OOD performance (&#x0394;) is calculated as &#x0394;=ID&#x2212;OOD. 
This represents the performance gap, which measures how well the model generalizes to unseen clinical notes, compared to the data it was trained on. A smaller difference indicates that the model is robust, even when faced with clinical notes with different writing styles or from different clinical cases. Hyperparameters used for fine-tuning are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Hyperparameters used for fine-tuning.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter description</td><td align="left" valign="bottom">LLaMa-3.1</td><td align="left" valign="bottom">BERT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Evaluation batch size per device</td><td align="left" valign="top">&#x2013;<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">2e-4</td><td align="left" valign="top">2e-5</td></tr><tr><td align="left" valign="top">LoRA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> dropout rate</td><td align="left" valign="top">0.05</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">LoRA rank (number of adaptation dims)</td><td align="left" valign="top">16</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">LoRA scaling factor</td><td align="left" valign="top">64</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">Maximum gradient norm (clipping)</td><td align="left" valign="top">0.3</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">Maximum sequence length</td><td align="left" valign="top">3000</td><td align="left" valign="top">512</td></tr><tr><td 
align="left" valign="top">Number of gradient accumulation steps</td><td align="left" valign="top">4</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">Number of training epochs</td><td align="left" valign="top">2</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">Optimizer used</td><td align="left" valign="top">AdamW</td><td align="left" valign="top">AdamW</td></tr><tr><td align="left" valign="top">Training batch size per device</td><td align="left" valign="top">4</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Warmup ratio for learning rate</td><td align="left" valign="top">0.05</td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">Weight decay for regularization</td><td align="left" valign="top">&#x2013;</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top">Number of warmup steps</td><td align="left" valign="top">&#x2013;</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table2fn2"><p><sup>b</sup>Not applicable.</p></fn><fn id="table2fn3"><p><sup>c</sup>LoRA: Low-Rank Adaptation.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study did not involve experiments with human subjects, and no personally identifiable information was used at any stage. The clinical notes were physician-authored, based on board exam cases that are publicly available. Therefore, there are no ethical concerns related to patient privacy or informed consent in this research.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Findings</title><p><xref ref-type="table" rid="table3">Table 3</xref> shows the findings of ID and OOD evaluation. 
All models were evaluated using the microaveraged <italic>F</italic><sub>1</sub>-score, focusing on exact span matches of disease name entities. All results are averaged over three runs with different random seeds. For GPT-4o, we set the generation temperature to zero to ensure deterministic outputs and only ran the evaluation once due to annotation budget constraints.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Evaluation results in micro <italic>F</italic><sub>1</sub>-scores with standard deviations.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">CR<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">PN<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, mean (SD)</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">&#x2192;CR<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> (ID)<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">&#x2192;PN<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (OOD)<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">&#x0394;CR<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">&#x2192;PN (ID)</td><td align="left" valign="top">&#x2192;CR (OOD)</td><td align="left" valign="top">&#x0394;PN<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA-3.1 (Zero-shot)</td><td align="left" valign="top">27.4 (0.3)</td><td align="left" valign="top">20.4 (0.1)</td><td align="left" valign="top">&#x2212;7.0</td><td align="left" valign="top">15.5 (0.4)</td><td align="left" valign="top">27.1 (0.1)</td><td align="left" 
valign="top">11.6</td></tr><tr><td align="left" valign="top">LLaMA-3.1 (Few-shot)</td><td align="left" valign="top">32.6 (1.6)</td><td align="left" valign="top">30.5 (0.7)</td><td align="left" valign="top">&#x2212;2.1</td><td align="left" valign="top">37.0 (4.8)</td><td align="left" valign="top">35.9 (0.5)</td><td align="left" valign="top">&#x2212;1.1</td></tr><tr><td align="left" valign="top">GPT-4o (Zero-shot)</td><td align="left" valign="top">49.5 (0.0)</td><td align="left" valign="top">47.7 (0.0)</td><td align="left" valign="top">&#x2212;1.8</td><td align="left" valign="top">42.0 (0.0)</td><td align="left" valign="top">50.8 (0.0)</td><td align="left" valign="top">8.8</td></tr><tr><td align="left" valign="top">GPT-4o (Few-shot)</td><td align="left" valign="top">53.4 (0.0)</td><td align="left" valign="top">49.9 (0.0)</td><td align="left" valign="top">&#x2212;3.5</td><td align="left" valign="top">56.2 (0.0)</td><td align="left" valign="top">54.4 (0.0)</td><td align="left" valign="top">&#x2212;1.8</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup> (Fine-tuned)</td><td align="left" valign="top">73.7 (0.2)</td><td align="left" valign="top">59.8 (0.2)</td><td align="left" valign="top">&#x2212;13.9</td><td align="left" valign="top">79.7 (1.8)</td><td align="left" valign="top">55.5 (1.7)</td><td align="left" valign="top">&#x2212;24.2</td></tr><tr><td align="left" valign="top">LLaMA-3.1 (Fine-tuned)</td><td align="left" valign="top">78.4 (0.5)</td><td align="left" valign="top">69.8 (0.6)</td><td align="left" valign="top">&#x2212;8.6</td><td align="left" valign="top">81.9 (0.7)</td><td align="left" valign="top">67.2 (0.5)</td><td align="left" valign="top">&#x2212;14.7</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>CR: case reports.</p></fn><fn id="table3fn2"><p><sup>b</sup>PN: progress notes.</p></fn><fn id="table3fn3"><p><sup>c</sup>ID: in-domain.</p></fn><fn 
id="table3fn4"><p><sup>d</sup>OOD: out-of-domain.</p></fn><fn id="table3fn5"><p><sup>e</sup>&#x0394;CR: difference in case reports.</p></fn><fn id="table3fn6"><p><sup>f</sup>&#x0394;PN: difference in progress notes.</p></fn><fn id="table3fn7"><p><sup>g</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Comparison Between Fine-Tuning and ICL</title><p>The fine-tuned LLaMA-3.1 consistently outperformed its zero-shot and few-shot counterparts, achieving the highest <italic>F</italic><sub>1</sub>-scores across all evaluation settings. These results highlight the effectiveness of fine-tuning for clinical NER. In contrast, vanilla LLaMA-3.1 exhibited limited performance in zero-shot and few-shot scenarios. Notably, in the few-shot setting, the model showed a decline in performance when applied to OOD data.</p><p>GPT-4o demonstrated strong few-shot performance (ie, 53.4 on CR&#x2192;CR and 56.2 on PN&#x2192;PN) despite having no access to training data, highlighting the robustness of large-scale foundation models. However, these models still underperformed compared to the fine-tuned BERT, consistent with previous findings that task-specific fine-tuning often outperforms in-context learning in specialized domains like clinical NER [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. These findings illustrate that while ICL can provide a competitive baseline with minimal data, fine-tuning remains essential.</p></sec><sec id="s3-3"><title>Comparison Between MLM and LLM</title><p>The fine-tuned LLaMA-3.1 outperformed the BERT baseline across all ID and OOD settings, demonstrating superior robustness to the previously unseen document type. 
Specifically, the fine-tuned LLaMA-3.1 exhibited a smaller performance gap (&#x0394;CR=&#x2212;8.6 and &#x0394;PN=&#x2212;14.7) compared to BERT (&#x0394;CR=&#x2212;13.9 and &#x0394;PN=&#x2212;24.2), suggesting its greater stability for practical use in diverse clinical settings.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Experimental results demonstrate that the fine-tuned LLM exhibits strong robustness, maintaining relatively stable performance even when applied to previously unseen progress notes. To better understand the factors contributing to this robustness, we further decomposed it into two aspects: (1) robustness to stylistic variation (ie, variations across physicians) and (2) robustness to variation across clinical cases. Our analysis reveals that the LLM was particularly robust to stylistic differences, while showing greater sensitivity to the differences in clinical cases.</p><p>We also conducted an error analysis to examine the qualitative improvements of the LLM over the MLM baseline performance. Our findings suggest that the LLM benefits from its generative approach, which allows it to mark entities inline within the sentence, as opposed to relying on a classification head over token representations, as in MLMs. This generative approach enables the LLM to more accurately extract entity spans, especially in stylistically diverse clinical notes.</p></sec><sec id="s4-2"><title>Impact of Physician and Clinical Case Variation</title><p>To examine the robustness of the fine-tuned LLM to variations in writing styles and clinical cases, we stratified the performance of the fine-tuned LLaMA-3.1 and BERT in the CR&#x2192;PN setting by physician and clinical case. The stratification process is summarized in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Overview of the evaluation with stratification. 
Performance of fine-tuned LLaMA-3.1 and BERT was stratified by physician and clinical case to assess robustness to writing style and clinical case variations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76773_fig02.png"/></fig><sec id="s4-2-1"><title>Physician-Based</title><p>Performance was evaluated for each physician, with ten clinical notes authored per individual. For instance, the performance of models is averaged across <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>P</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> to <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>P</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> for the i-th physician. The variation in these stratified results allows us to assess the model&#x2019;s sensitivity to variations in individual writing styles.</p></sec><sec id="s4-2-2"><title>Clinical Case-Based</title><p>Performance was also evaluated for each clinical case modeled after board exam-style scenarios (eg, acute appendicitis). The model&#x2019;s performance was averaged over ten notes from different physicians, for example, <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>P</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mi> </mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> to <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>P</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mn>10</mml:mn><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> for the j-th clinical case. 
This stratification enables an analysis of the model&#x2019;s ability to generalize to clinical-case-specific disease entities.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> presents the distribution of <italic>F</italic><sub>1</sub>-scores stratified by physician and clinical case in the CR&#x2192;PN setting for both the fine-tuned LLaMA-3.1 and BERT. The spread of each box and the range of whiskers reflect performance variability, while dots indicate outliers beyond 1.5-times the interquartile range. Narrower boxes and smaller ranges indicate higher consistency, while outliers and wider spreads highlight sensitivity to writing style or clinical case variation.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Distribution of <italic>F</italic><sub>1</sub>-scores in CR&#x2192;PN stratified by physician and clinical case. BERT: Bidirectional Encoder Representations from Transformers; CR: case report; PN: progress note.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76773_fig03.png"/></fig></sec><sec id="s4-2-3"><title>Physician Variance</title><p>The fine-tuned LLaMA-3.1 demonstrated consistently strong performance across different physicians, with relatively low variance in <italic>F</italic><sub>1</sub>-scores. This is indicated in the narrow range between the maximum and minimum values in the box plot. In contrast, BERT&#x2019;s performance varied more widely, with significant performance drops in some physicians&#x2019; notes, as indicated by extreme outliers and low minimum values. This suggests that the fine-tuned LLaMA-3.1 is more robust to stylistic differences, potentially due to its exposure to a broader and more diverse range of linguistic patterns during pretraining.</p></sec><sec id="s4-2-4"><title>Clinical Case Variance</title><p>When stratified by clinical case, both models showed greater variability compared to writing style. 
This is evidenced by wider boxes and larger ranges between the maximum and minimum values in the box plots. These results highlight the increased difficulty of generalizing to domain-specific disease names. While the fine-tuned LLaMA-3.1 generally achieved higher average <italic>F</italic><sub>1</sub>-score across clinical cases, it also experienced sharp drops in certain cases, indicating that it remains sensitive to clinical-case-specific variation. This underscores the persistent challenge in processing previously unseen clinical cases, even for large instruction-tuned models.</p><p>These findings underscore the relative strength of the fine-tuned LLaMA-3.1 in handling clinical stylistic variation. At the same time, they point to the need for further work in addressing performance gaps in different clinical cases.</p></sec></sec><sec id="s4-3"><title>Error Analysis</title><p>To examine the qualitative improvements of the LLM over the MLM baselines, we conducted error analysis in the CR&#x2192;PN setting. Based on the observation that the fine-tuned BERT has lower precision (51.8) compared to the fine-tuned LLaMA-3.1 (70.8), while both models achieved similar recall scores (70.7 for BERT and 68.9 for LLaMA-3.1), we focused on false positive cases for the analysis.</p><p>Representative error examples are summarized in <xref ref-type="table" rid="table4">Table 4</xref>. From a manual inspection, we observed that BERT frequently misclassified nondisease clinical entities such as laboratory tests and biomarkers as disease mentions. For instance, in the sentence &#x201C;Tumor markers also decreased gradually (&#x5F90;&#x3005;&#x306B;&#x816B;&#x760D;&#x30DE;&#x30FC;&#x30AB;&#x30FC;&#x3082;&#x4F4E;&#x4E0B;&#x3057;),&#x201D; BERT incorrectly predicted two spans: &#x201C;tumor markers (&#x816B;&#x760D;&#x30DE;&#x30FC;&#x30AB;&#x30FC;)&#x201D; and &#x201C;decrease (&#x4F4E;&#x4E0B;).&#x201D; Both terms describe laboratory findings rather than disease entities. 
In another example, in the sentence &#x201C;Therefore, with regard to platelets (&#x3053;&#x306E;&#x305F;&#x3081;&#x8840;&#x5C0F;&#x677F;&#x306B;&#x5BFE;&#x3057;&#x3066;&#x306F;),&#x201D; BERT erroneously extracted &#x201C;platelets (&#x8840;&#x5C0F;&#x677F;),&#x201D; which refers to a blood cell type rather than a pathological condition. These examples illustrate that BERT often struggles to distinguish diagnostically relevant clinical phrases from true disease mentions, leading to false positive predictions.</p><p>In addition to entity type confusion, BERT also often struggled to capture the complete span of disease mentions, frequently producing boundary errors or partial matches that failed to align with the gold-standard annotations. For instance, in the sentence &#x201C;Hb 8.1 g/dL and moderate nutritional disorder were observed (Hb8.1g/dl&#x3068;&#x4E2D;&#x7A0B;&#x5EA6;&#x306E;&#x6804;&#x990A;&#x969C;&#x5BB3;&#x3092;&#x8A8D;&#x3081;&#x305F;),&#x201D; BERT incorrectly predicted only the prefix of a laboratory value, &#x201C;Hb8,&#x201D; entirely missing the actual disease mention &#x201C;&#x6804;&#x990A;&#x969C;&#x5BB3; (nutritional disorder).&#x201D; In another example, &#x201C;The drainage volume was excessive (&#x6392;&#x6DB2;&#x91CF;&#x304C;&#x904E;&#x591A;&#x3067;&#x3042;&#x3063;&#x305F;),&#x201D; BERT extracted only the character &#x201C;&#x591A; (excessive),&#x201D; omitting the full phrase &#x201C;&#x6392;&#x6DB2;&#x91CF;&#x304C;&#x904E;&#x591A; (excessive drainage volume).&#x201D; These examples illustrate how slight shifts in input text can lead to misaligned token representations in MLM&#x2019;s embedding space, resulting in fragmented or incomplete entity predictions.</p><p>In contrast, the LLM demonstrates greater flexibility in capturing complete spans of disease mentions. Unlike the MLM, which often struggles with partial matches, the LLM&#x2019;s generative approach allows entities to be marked directly and seamlessly within the sentence. 
For example, in the sentence &#x201C;Hb8.1g/dl&#x3068;&#x4E2D;&#x7A0B;&#x5EA6;&#x306E;&#x003C;d&#x003E;&#x6804;&#x990A;&#x969C;&#x5BB3;&#x003C;/d&#x003E;&#x3092;&#x8A8D;&#x3081;&#x305F;&#x201D; (translated as &#x201C;Hb 8.1 g/dL and moderate &#x003C;d&#x003E;nutritional disorder&#x003C;/d&#x003E; were observed&#x201D;), the disease mention &#x201C;&#x6804;&#x990A;&#x969C;&#x5BB3; (nutritional disorder)&#x201D; is correctly and fully captured within the sentence using inline entity tags. This inline tagging strategy enables the model to stably extract entire disease names, even when clinical notes vary in writing style.</p><p>The LLM&#x2019;s ability to overcome the errors observed in BERT predictions likely stems from its different learning paradigm: rather than relying solely on token-level classification based on fixed input embeddings, the LLM generates structured outputs conditioned on the full context of the input. This generative approach allows the LLM to better maintain entity span prediction coherence over varying document types and incorporate broader sentence-level semantics into prediction. Consequently, the LLM achieves more robust and accurate extraction performance compared to the MLM baseline, particularly in OOD settings.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Examples of BERT prediction errors in the CR&#x2192;PN setting. 
Each row shows a sentence (left), predicted entity spans (middle), and the correct gold annotations (right).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Example sentence (English/Japanese)</td><td align="left" valign="bottom">Prediction</td><td align="left" valign="bottom">Gold annotation</td></tr></thead><tbody><tr><td align="left" valign="top">Tumor markers also decreased gradually/&#x816B;&#x760D;&#x30DE;&#x30FC;&#x30AB;&#x30FC;&#x3082;&#x4F4E;&#x4E0B;&#x3057;</td><td align="left" valign="top">tumor markers/&#x816B;&#x760D;&#x30DE;&#x30FC;&#x30AB;&#x30FC;, decrease/&#x4F4E;&#x4E0B;</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">Therefore, with regard to platelets/&#x3053;&#x306E;&#x305F;&#x3081;&#x8840;&#x5C0F;&#x677F;&#x306B;&#x5BFE;&#x3057;&#x3066;&#x306F;</td><td align="left" valign="top">platelets/&#x8840;&#x5C0F;&#x677F;</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">Hb 8.1g/dL and moderate nutritional disorder were observed/Hb8.1g/dl&#x3068;&#x4E2D;&#x7A0B;&#x5EA6;&#x306E;&#x6804;&#x990A;&#x969C;&#x5BB3;&#x3092;&#x8A8D;&#x3081;&#x305F;</td><td align="left" valign="top">Hb8</td><td align="left" valign="top">nutritional disorder/&#x6804;&#x990A;&#x969C;&#x5BB3;</td></tr><tr><td align="left" valign="top">The drainage volume was excessive/&#x6392;&#x6DB2;&#x91CF;&#x304C;&#x904E;&#x591A;&#x3067;&#x3042;&#x3063;&#x305F;</td><td align="left" valign="top">excessive/&#x591A;</td><td align="left" valign="top">excessive drainage volume/&#x6392;&#x6DB2;&#x91CF;&#x304C;&#x904E;&#x591A;</td></tr></tbody></table></table-wrap></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations. First, due to the annotation cost, our evaluation focused exclusively on disease name recognition. 
While disease entities are fundamental to clinical NLP tasks, real-world applications often require the extraction of a wider range of entities such as medications, procedures, and laboratory findings. Future research should expand the scope of entity types to provide a more comprehensive evaluation of model capabilities in diverse clinical information extraction tasks.</p><p>Second, we did not evaluate the computational efficiency or resource demands of the models. This is particularly relevant for LLMs, which often require substantial computational resources during both training and inference. Future studies should include a systematic comparison of computational cost, memory usage, and inference latency to guide more practical model deployment in clinical environments.</p><p>Lastly, we did not include comparisons with models pretrained on large-scale medical corpora, such as Bio-BERT [<xref ref-type="bibr" rid="ref27">27</xref>] or its Japanese counterpart [<xref ref-type="bibr" rid="ref28">28</xref>]. These models may have inherent advantages in understanding domain-specific terminology and context, and their inclusion could provide a clearer upper bound on MLM-based methods. Future work may incorporate medical-specific, pretrained models and a broader range of domain adaptation techniques in Japanese clinical settings.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study evaluated the performance of fine-tuned LLMs on disease name recognition in Japanese clinical notes, with a focus on both ID and OOD robustness. Our results demonstrate that fine-tuned LLMs, specifically the fine-tuned LLaMA-3.1, consistently outperforms the strong baselines across OOD settings, demonstrating superior robustness to previously unseen clinical notes.</p><p>Stratified analyses revealed that the LLM exhibits greater robustness to stylistic variation among physicians, as reflected in its lower performance variance across physicians. 
However, variations across clinical cases continue to pose significant challenges, with both LLM and the baseline model showing considerable fluctuations. Error analysis highlighted the LLM&#x2019;s ability to consistently capture complete entity spans in stylistically diverse clinical notes. Its generative approach enables more context-aware span prediction, contributing to stable performance over the baseline model.</p><p>Overall, our findings underscore the potential of fine-tuned LLMs for clinical named entity recognition in low-resource languages such as Japanese, particularly in contexts with considerable variation in writing style. Nevertheless, challenges in cross-clinical case robustness remain. Future work should explore more targeted domain adaptation techniques and integration of external medical knowledge to further enhance robustness in real-world clinical NLP applications.</p></sec></sec></body><back><ack><p>This work was supported by Cross-ministerial Strategic Innovation Promotion Program (SIP) on &#x201C;Integrated Health Care System&#x201D; (Grant Number JPJ012425) and JST CREST (Grant Number: JPMJCR22N1), Japan.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health records</p></def></def-item><def-item><term id="abb3">LLMs</term><def><p>large language models</p></def></def-item><def-item><term id="abb4">ID</term><def><p>in-domain</p></def></def-item><def-item><term id="abb5">OOD</term><def><p>out-of-domain</p></def></def-item><def-item><term id="abb6">CR</term><def><p>case reports</p></def></def-item><def-item><term id="abb7">PN</term><def><p>progress notes</p></def></def-item><def-item><term id="abb8">MLM</term><def><p>masked language model</p></def></def-item><def-item><term id="abb9">ICL</term><def><p>in-context 
learning</p></def></def-item><def-item><term id="abb10">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb11">BIO</term><def><p>beginning inside outside</p></def></def-item><def-item><term id="abb12">LoRA</term><def><p>low-rank adaptation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zweigenbaum</surname><given-names>P</given-names> </name><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>KB</given-names> </name></person-group><article-title>Frontiers of biomedical text mining: current progress</article-title><source>Brief Bioinform</source><year>2007</year><month>09</month><volume>8</volume><issue>5</issue><fpage>358</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1093/bib/bbm045</pub-id><pub-id pub-id-type="medline">17977867</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Clinical information extraction applications: a literature review</article-title><source>J Biomed Inform</source><year>2018</year><month>01</month><volume>77</volume><fpage>34</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id><pub-id pub-id-type="medline">29162496</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kreimeyer</surname><given-names>K</given-names> </name><name name-style="western"><surname>Foster</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Natural language processing systems for capturing and standardizing unstructured clinical information: a systematic review</article-title><source>J Biomed Inform</source><year>2017</year><month>09</month><volume>73</volume><fpage>14</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.07.012</pub-id><pub-id pub-id-type="medline">28729030</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>W</given-names> </name><name name-style="western"><surname>Rumshisky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name></person-group><article-title>Evaluating temporal relations in clinical text: 2012 i2b2 challenge</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>09</month><volume>20</volume><issue>5</issue><fpage>806</fpage><lpage>813</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001628</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Henry</surname><given-names>S</given-names> </name><name name-style="western"><surname>Buchan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Filannino</surname><given-names>M</given-names> </name><name name-style="western"><surname>Stubbs</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name></person-group><article-title>2018 n2c2 shared task on adverse drug events and medication extraction in electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>01</month><day>1</day><volume>27</volume><issue>1</issue><fpage>3</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz166</pub-id><pub-id pub-id-type="medline">31584655</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bose</surname><given-names>P</given-names> </name><name name-style="western"><surname>Srinivasan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sleeman</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Palta</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>P</given-names> </name></person-group><article-title>A survey on recent named entity recognition and relationship extraction techniques on clinical texts</article-title><source>Appl Sci (Basel)</source><year>2021</year><month>01</month><volume>11</volume><issue>18</issue><fpage>8319</fpage><pub-id pub-id-type="doi">10.3390/app11188319</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kawazoe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shimamoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Seki</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Post-marketing surveillance of anticancer drugs using natural language processing of electronic 
medical records</article-title><source>NPJ Digit Med</source><year>2024</year><month>11</month><day>9</day><volume>7</volume><issue>1</issue><fpage>315</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01323-1</pub-id><pub-id pub-id-type="medline">39521935</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Naguib</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tannier</surname><given-names>X</given-names> </name><name name-style="western"><surname>N&#x00E9;v&#x00E9;ol</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Al-Onaizan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bansal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names> </name></person-group><article-title>Few-shot clinical entity recognition in English, French and Spanish: masked language models outperform generative model prompting</article-title><year>2024</year><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2024</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, Florida, USA</conf-loc><fpage>6829</fpage><lpage>6852</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-emnlp.400</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt 
engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herman Bernardim Andrade</surname><given-names>G</given-names> </name><name name-style="western"><surname>Nishiyama</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fujimaki</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Assessing domain adaptation in adverse drug event extraction on real-world breast cancer records</article-title><source>Int J Med Inform</source><year>2024</year><month>11</month><volume>191</volume><fpage>105539</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105539</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laparra</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>TA</given-names> </name></person-group><article-title>Rethinking domain adaptation for machine learning over clinical language</article-title><source>JAMIA Open</source><year>2020</year><month>07</month><volume>3</volume><issue>2</issue><fpage>146</fpage><lpage>150</lpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooaa010</pub-id><pub-id pub-id-type="medline">32734151</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keloth</surname><given-names>VK</given-names> 
</name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Advancing entity recognition in biomedicine via instruction tuning of large language models</article-title><source>Bioinformatics</source><year>2024</year><month>03</month><day>29</day><volume>40</volume><issue>4</issue><fpage>btae163</fpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btae163</pub-id><pub-id pub-id-type="medline">38514400</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zuo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Information extraction from clinical notes: are we ready to switch to large language models?</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 7, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.10020</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Burstein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Doran</surname><given-names>C</given-names> </name><name name-style="western"><surname>Solorio</surname><given-names>T</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><year>2019</year><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota, USA</conf-loc><fpage>4171</fpage><lpage>4186</lpage><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wakamiya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name></person-group><article-title>REAL-mednlp: overview of REAL document-based medical natural language processing task</article-title><year>2022</year><access-date>2025-06-30</access-date><conf-name>NTCIR 16 Conference: Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies</conf-name><conf-date>Jun 14-17, 2022</conf-date><conf-loc>Tokyo, 
Japan</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/01-NTCIR16-OV-MEDNLP-YadaS.pdf">https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/01-NTCIR16-OV-MEDNLP-YadaS.pdf</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Joh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tanaka</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>F</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kurohashi</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Calzolari</surname><given-names>N</given-names> </name><name name-style="western"><surname>B&#x00E9;chet</surname><given-names>F</given-names> </name><name name-style="western"><surname>Blache</surname><given-names>P</given-names> </name><name name-style="western"><surname>Choukri</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cieri</surname><given-names>C</given-names> </name><name name-style="western"><surname>Declerck</surname><given-names>T</given-names> </name><name name-style="western"><surname>Goggi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Isahara</surname><given-names>H</given-names> </name><name name-style="western"><surname>Maegaard</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mariani</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mazo</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Moreno</surname><given-names>A</given-names> </name><name name-style="western"><surname>Odijk</surname><given-names>J</given-names> </name><name name-style="western"><surname>Piperidis</surname><given-names>S</given-names> </name></person-group><article-title>Towards a versatile medical-annotation guideline feasible without heavy medical knowledge: starting from critical lung diseases</article-title><year>2020</year><access-date>2025-04-13</access-date><conf-name>Proceedings of the Twelfth Language Resources and Evaluation Conference</conf-name><conf-date>May 11-16, 2020</conf-date><conf-loc>Marseille, France</conf-loc><fpage>4565</fpage><lpage>4572</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.lrec-1.561/">https://aclanthology.org/2020.lrec-1.561/</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Tohoku-nlp/bert-base-japanese-v3</article-title><source>Hugging Face</source><year>2024</year><access-date>2025-04-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/tohoku-nlp/bert-base-japanese-v3">https://huggingface.co/tohoku-nlp/bert-base-japanese-v3</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fujii</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Loem</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Continual pre-training for cross-lingual LLM adaptation: enhancing Japanese language capabilities</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 27, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.17790</pub-id></nlm-citation></ref><ref 
id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nishiyama</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nishidani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ando</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wakamiya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name></person-group><article-title>NAISTSOC at the NTCIR-16 real-mednlp task</article-title><year>2022</year><access-date>2025-02-03</access-date><conf-name>NTCIR 16 Conference: Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies</conf-name><conf-date>Jun 14-17, 2022</conf-date><conf-loc>Tokyo, Japan</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/07-NTCIR16-MEDNLP-NishiyamaT.pdf">https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/07-NTCIR16-MEDNLP-NishiyamaT.pdf</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Oct 16, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><year>2024</year><month>03</month><day>4</day><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chadha</surname><given-names>A</given-names> </name></person-group><article-title>A systematic survey of prompt engineering in large language models: techniques and applications</article-title><source>arXiv</source><comment>Preprint posted online on Mar 16, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.07927</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ye</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pryzant</surname><given-names>R</given-names> </name><name name-style="western"><surname>Khani</surname><given-names>F</given-names> </name></person-group><person-group 
person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>Prompt engineering a prompt engineer</article-title><year>2024</year><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand and virtual meeting</conf-loc><fpage>355</fpage><lpage>385</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.21</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Langren&#x00E9;</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name></person-group><article-title>Unleashing the potential of prompt engineering for large language models</article-title><source>Patterns</source><year>2025</year><month>06</month><volume>6</volume><issue>6</issue><fpage>101260</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2025.101260</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Prompt engineering in consistency and reliability with the evidence-based guideline for 
LLMs</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>20</day><volume>7</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01029-4</pub-id><pub-id pub-id-type="medline">38378899</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kawazoe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shibata</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shinohara</surname><given-names>E</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ohe</surname><given-names>K</given-names> </name></person-group><article-title>A clinical specific BERT developed using a huge Japanese clinical text corpus</article-title><source>PLoS One</source><year>2021</year><volume>16</volume><issue>11</issue><fpage>e0259763</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0259763</pub-id><pub-id pub-id-type="medline">34752490</pub-id></nlm-citation></ref></ref-list></back></article>