<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e81500</article-id><article-id pub-id-type="doi">10.2196/81500</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Model Automated Extraction of Clinical Signs and Symptoms From Emergency Department Reports for Machine Learning Prediction Models: Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Schipper</surname><given-names>Anoeska</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Belgers</surname><given-names>Peter</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Connor</surname><given-names>Rory David</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>van de Wouw</surname><given-names>Lieke</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Builtjes</surname><given-names>Luc</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bosma</surname><given-names>Joeran S</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kusters</surname><given-names>Ron</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kurstjens</surname><given-names>Steef</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rutten</surname><given-names>Matthieu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>van Ginneken</surname><given-names>Bram</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Diagnostic Image Analysis Group, Medical Imaging Department, Radboud University Medical Center</institution><addr-line>Geert Grooteplein Zuid 10</addr-line><addr-line>Nijmegen</addr-line><country>The Netherlands</country></aff><aff id="aff2"><institution>Laboratory of Clinical Chemistry and Hematology, Jeroen Bosch Ziekenhuis</institution><addr-line>Den Bosch</addr-line><country>The Netherlands</country></aff><aff id="aff3"><institution>Psychiatry Department, Radboud 
University Medical Center</institution><addr-line>Nijmegen</addr-line><country>The Netherlands</country></aff><aff id="aff4"><institution>Emergency Department, Jeroen Bosch Ziekenhuis</institution><addr-line>Den Bosch</addr-line><country>The Netherlands</country></aff><aff id="aff5"><institution>Department of Health Technology and Services Research, Technical Medical Centre, University of Twente</institution><addr-line>Enschede</addr-line><country>The Netherlands</country></aff><aff id="aff6"><institution>Laboratory of Clinical Chemistry and Laboratory Medicine, Dicoon BV, Canisius-Wilhelmina Ziekenhuis</institution><addr-line>Nijmegen</addr-line><country>The Netherlands</country></aff><aff id="aff7"><institution>Medical Imaging Department, Jeroen Bosch Ziekenhuis</institution><addr-line>Den Bosch</addr-line><country>The Netherlands</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Xu</surname><given-names>He</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Huan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Luo</surname><given-names>Rui</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Bram van Ginneken, PhD, Diagnostic Image Analysis Group, Medical Imaging Department, Radboud University Medical Center, Geert Grooteplein Zuid 10, Nijmegen, 6525 GA, The Netherlands, 31 614021323; <email>bramvanginneken@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e81500</elocation-id><history><date 
date-type="received"><day>30</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>06</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>06</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Anoeska Schipper, Peter Belgers, Rory David O'Connor, Lieke van de Wouw, Luc Builtjes, Joeran S Bosma, Ron Kusters, Steef Kurstjens, Matthieu Rutten, Bram van Ginneken. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 30.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e81500"/><abstract><sec><title>Background</title><p>Most clinically relevant information in emergency department (ED) visits is documented in free text, limiting reuse for research and clinical decision support. Despite growing interest in large language model (LLM)&#x2013;based feature extraction, very few studies have examined it directly on ED reports. 
Existing work has mainly addressed binary tasks and rarely evaluated their impact on downstream prediction models. Furthermore, evidence for small multilingual LLMs remains limited, especially for underrepresented languages such as Dutch. Locally deployable LLMs could enable automated feature extraction for decision support systems without increasing physician workload.</p></sec><sec><title>Objective</title><p>We aim to evaluate whether a small open-source LLM (Qwen 2.5:14B) can automatically extract 16 clinical signs and symptoms from ED reports and use these as input for an appendicitis prediction model. LLM performance under minimal and optimized 0-shot prompts was assessed against researcher annotations (reference standard) and physician annotations.</p></sec><sec sec-type="methods"><title>Methods</title><p>This retrospective study used 336 ED reports from patients presenting with acute abdominal pain to a Dutch teaching hospital (2016-2023). One hundred reports were randomly selected to develop a minimal and an optimized 0-shot prompt strategy. The remaining 236 reports, reserved for evaluation, were annotated by 2 ED physicians and processed by the LLM to extract 16 signs and symptoms, covering binary, multiclass, and multilabel classification tasks. These features were used as input to the HIVE (History, Intake, Vitals, Examination) appendicitis prediction model. LLM extraction accuracy, sensitivity, and specificity were measured against the researcher&#x2019;s (reference standard) and physician annotations. The HIVE model&#x2019;s area under the receiver operating characteristic curve was evaluated using LLM-extracted vs physician-annotated features.</p></sec><sec sec-type="results"><title>Results</title><p>Among 336 ED reports from patients with acute abdominal pain (median age 41, IQR 22&#x2010;62 years, 205/336, 61% female), 50% (167/336) had appendicitis. 
The LLM achieved weighted average accuracies of 0.910 (95% CI &#x00B1;0.018) with minimal prompts and 0.929 (95% CI &#x00B1;0.016) with optimized prompts, vs 0.961 (95% CI &#x00B1;0.012) and 0.951 (95% CI &#x00B1;0.015) for physicians. Corresponding HIVE model areas under the receiver operating characteristic curve were 0.871 (95% CI &#x00B1;0.019) and 0.911 (95% CI &#x00B1;0.014) with LLM inputs under the minimal and optimized prompts, compared to 0.917 (95% CI &#x00B1;0.015) and 0.924 (95% CI &#x00B1;0.018) for physician inputs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>A small locally deployable multilingual LLM can approach physician-level accuracy in extracting structured binary, multiclass, and multilabel clinical data from free-text Dutch ED reports, while preserving patient privacy, interpretability, and statistical transparency for downstream diagnostic modeling.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>natural language processing</kwd><kwd>electronic health records</kwd><kwd>machine learning</kwd><kwd>predictive modeling</kwd><kwd>health informatics</kwd><kwd>emergency medicine</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In the emergency department (ED), clinical information is primarily documented as free text. While this documentation style aligns with routine clinical practice, it presents challenges for reusing data in research and decision support applications [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Structured templates in the electronic health record (EHR) system offer a common workaround, but impose extra workload on already time-pressed ED physicians and are rarely adopted in practice [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Automated feature extraction using natural language processing (NLP) offers a more sustainable alternative. 
To date, extracting clinical data from ED reports has primarily relied on long short-term memory and BERT (Bidirectional Encoder Representations From Transformers) models, which perform reasonably well but require extensive preprocessing, large manually annotated corpora, and model retraining [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Generative pretrained large language models (LLMs) present an alternative, as they can be optimized using only prompting, making them more accessible for downstream tasks. While these LLMs have proved effective in extracting diverse features from radiology and pathology reports [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], their use in automated feature extraction from ED reports is limited. Unlike radiology or pathology reports, ED reports contain extensive abbreviations, succinct phrasing, and domain-specific terminology, presenting unique challenges to NLP. Recent work has begun to explore LLM-based information extraction in emergency medicine. McMurry et al [<xref ref-type="bibr" rid="ref14">14</xref>] demonstrated LLM-based identification of binary respiratory symptoms using 0-shot prompting and multicenter validation, while Bejan et al [<xref ref-type="bibr" rid="ref15">15</xref>] phenotyped symptomatic kidney-stone presentations using a range of prompting and fine-tuning strategies. Gao et al [<xref ref-type="bibr" rid="ref16">16</xref>] used LLM-generated severity scores, supported by in-context learning and retrieval-augmented generation, to improve early triage predictions. Collectively, recent studies demonstrate the potential of LLM-based feature extraction from ED reports. However, these efforts primarily addressed binary outcomes or limited symptom sets, or did not investigate the downstream impact on diagnostic model performance. 
Prior work also relied mainly on large LLMs or Chinese bilingual LLMs, leaving open whether smaller multilingual LLMs can perform reliably in underrepresented languages [<xref ref-type="bibr" rid="ref17">17</xref>]. This is particularly relevant for Dutch, where the inherent complexity of ED reports is compounded by limited representation in LLM training corpora.</p><p>This study addresses these gaps by evaluating whether a small multilingual LLM (Qwen 2.5:14B) can automatically extract binary, multiclass, and multilabel clinical features from Dutch ED reports and provide reliable inputs for a downstream prediction model. Our focus is a clinically relevant use case: acute abdominal pain (AAP). Extracted features are used as input for the HIVE (History, Intake, Vitals, Examination) model, a previously established appendicitis prediction model based on 16 contributing clinical signs and symptoms [<xref ref-type="bibr" rid="ref18">18</xref>]. These features were originally annotated manually, a process that is labor-intensive and difficult to scale. This study evaluates whether an LLM can automate this extraction and produce comparable inputs by developing and validating two 0-shot prompting strategies. Secondary outcomes included comparison with physician annotations and assessment of the appendicitis prediction model using LLM-extracted vs physician-annotated features. We hypothesize that, with appropriate prompting, a small locally run multilingual LLM could achieve near-expert accuracy in feature extraction and maintain comparable predictive model performance, supporting the feasibility of a scalable, privacy-preserving workflow for decision support systems in the ED.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection</title><p>Data from 350 patients with AAP were retrospectively collected at Jeroen Bosch Hospital, a Dutch teaching hospital, between July 2016 and January 2023. 
Patient inclusion criteria and cohort construction are detailed in our previous study [<xref ref-type="bibr" rid="ref18">18</xref>]. This dataset included 167 appendicitis cases and 169 other AAP presentations. Among other AAP presentations, those suspected of appendicitis were balanced with those having nonspecific or other AAP causes based on initial ED assessments by triage nurses or referring physicians. Cases lacking sufficient medical history or physical examination data, or missing more than 70% of vital signs, were excluded (n=14), resulting in 336 cases. Each case contained triage and intake data, vital signs, and the medical history and physical examination sections. Further cohort details are in Supplemental Tables 1A-E of our previous publication [<xref ref-type="bibr" rid="ref18">18</xref>]. No exclusions were made based on age, comorbidities, medication use, or symptom presentation. Data were extracted and pseudonymized using CTcue (IQVIA Nederland B.V.).</p></sec><sec id="s2-2"><title>Study Design</title><p>All 336 ED reports were independently annotated by 2 lead researchers to establish the reference standard for 16 clinical signs and symptoms, as previously described [<xref ref-type="bibr" rid="ref18">18</xref>] (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Interrater agreement, assessed on a random sample of 80 reports, demonstrated high reliability, with average Krippendorff &#x03B1; values of 0.93 for binary features and 0.95 for the multiclass feature, and an average Jaccard similarity of 0.76 for multilabel features. The 16 features were derived from our previous HIVE model, in which XGBoost (Extreme Gradient Boosting) and SHAP (Shapley Additive Explanations) analyses were used to identify features contributing more than 95% of the model&#x2019;s predictive output (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). 
Their discriminative performance was previously assessed using the area under the receiver operating characteristic curve (AUROC) to determine the optimal feature set. The final selection comprised 11 medical history and 5 physical examination features, covering 8 binary, 1 multiclass, and 7 multilabel outputs.</p><p>LLM-based feature extraction using a 0-shot prompting strategy was chosen over conventional tokenization-based text classification because the goal was to extract clinically meaningful variables and integrate them with structured vital signs and intake information directly available from the EHR. Traditional token-based NLP methods focus on identifying or classifying text spans and cannot reliably capture negation, multilabel outputs, or nuanced clinical context, and typically provide token-level outputs that obscure the contribution of specific clinical findings. This reduces interpretability and alters feature-importance patterns (eg, via SHAP), limiting transparency and alignment with the structured design of the HIVE model.</p><p>For the present analysis, 100 reports were used to optimize two 0-shot prompting strategies for extracting these features using an LLM. Zero-shot prompting directly instructs the LLM without examples [<xref ref-type="bibr" rid="ref19">19</xref>]. The remaining 236 reports were reserved to evaluate manual labeling by 2 ED physicians and automated extraction by the LLM (<xref ref-type="fig" rid="figure1">Figure 1A</xref>).</p><p>These reports were also used for the HIVE appendicitis prediction model. Of the 336 reports, 268 had previously been labeled by the lead researchers and were used for model training and tuning. The remaining 68 reports, drawn from the 236 physician-labeled and LLM-processed reports for evaluation, served as 4 identical validation sets (<xref ref-type="fig" rid="figure1">Figure 1B</xref>). 
This validation set was selected before model development, contained no cases used in LLM prompt construction or hyperparameter tuning, and matched the validation size used in our previous study [<xref ref-type="bibr" rid="ref18">18</xref>]. This ensured a fully independent and consistent benchmark for assessing the effect of LLM-based feature extraction on downstream model performance. Given the modest dataset size (n=336) and single-center design, this study aimed to evaluate feasibility rather than generalizable performance. The results should therefore be interpreted as proof-of-concept.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study design overview comparing feature extraction strategies for the HIVE appendicitis prediction model. (A) Flowchart illustrating the experimental setup for comparing automated (LLM-based) and manual (physician-labeled) feature extraction strategies. All four experiments were conducted on the same set of 236 evaluation cases. (B) Flowchart outlining the HIVE model development and validation process. The model was trained and tuned on a separate, manually labeled development set (n=268), and validated on 68 cases drawn from the same pool of 236 cases used in panel (A). The same trained model was used across all 4 validation experiments; only the feature extraction method varied for the identical set of 68 validation cases. 
ED: emergency department; HIVE: History, Intake, Vitals, Examination; LLM: large language model; ML: machine learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig01.png"/></fig></sec><sec id="s2-3"><title>Workflow and Prompting Strategies</title><p>The primary objective of this study was to evaluate the LLM&#x2019;s automated extraction of 16 clinical signs and symptoms from ED reports, all of which contributed to our ML (HIVE) model (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). To that end, a 4-step workflow was implemented (<xref ref-type="fig" rid="figure2">Figure 2</xref>). First, ED reports were compiled into a JSON file. Next, a base prompt was designed, consisting of an instruction specifying which symptom or sign to extract and an output with a predefined set of answer options. From this base prompt, two 0-shot prompting strategies were derived. The minimal prompt strategy extended the base prompt with a limited set of essential clarifications, identical to the annotation rules provided to ED physicians to ensure equal comparison. The optimized prompt strategy further integrated additional elements aimed at steering performance, including broader context (artificial intelligence [AI] persona and report type), additional instructions (section limitations, and negations or constraints of symptoms), and specific context (domain-specific terminology or abbreviations, and symptom explanations). To quantify the contribution of individual elements, an ablation study was conducted at the start of the study on a development set of 100 ED reports. Each element was sequentially added to the base prompt, and performance was compared against the researcher&#x2019;s annotations. Only elements that improved performance were included in the optimized prompt. 
Both prompts used the same annotation rules; any differences between them are due to the prompt design, not to different instructions. All prompts and reports were provided to the LLM in Dutch. The LLM processed each report from the development (n=100) and evaluation sets (n=236), producing structured output in JSON format. The JSON format consisted of a pseudonymized patient identifier, the name of the target symptom, and the predefined answer options of the target symptom (eg, {"uid": "82547948A6A5E91B93B2BAACAE3F943508FD7EFA", "abdominal pain location": ["right lower quadrant", "diffuse"]}). Third, features extracted from 68 of 236 evaluation reports were combined with structured intake data and vital signs. Finally, these datasets were used for ML model validation. The remaining 268 cases, annotated by the research team, were used for developing the ML model.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Modular workflow for extracting clinical signs and symptoms using a local LLM, followed by appendicitis prediction. ED reports in JSON format (1) were processed using a base prompt (0-shot) (2A) containing an instruction and predefined output format. From this base, two prompting strategies were derived (2B): a minimal prompt, which included essential clarifications also provided to ED physicians for equal comparison, and an optimized prompt, integrating additional elements that were sequentially evaluated through an ablation study. Feature outputs from both strategies were combined with ED intake and vital-sign data (3) and entered into the HIVE ML model to predict appendicitis (4). 
AI: artificial intelligence; ED: emergency department; HIVE: History, Intake, Vitals, Examination; LLM: large language model; ML: machine learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig02.png"/></fig></sec><sec id="s2-4"><title>LLM Selection</title><p>In December 2024 and January 2025, four popular LLMs available on the Ollama platform were evaluated using 4 representative features varying in complexity and output type. Selected models, Mistral-Nemo 12B Instruct [<xref ref-type="bibr" rid="ref20">20</xref>], Qwen2.5:14B [<xref ref-type="bibr" rid="ref21">21</xref>], DeepSeek R1:14B distilled from Qwen2.5 [<xref ref-type="bibr" rid="ref22">22</xref>], and Gemma2:9B [<xref ref-type="bibr" rid="ref23">23</xref>], were chosen for their multilingual capabilities, input length, and compact size (&#x003C;10 GB), allowing deployment on a 12 GB GPU laptop without requiring connection to a computational cluster, which may be impractical in ED settings.</p><p>After evaluation, Alibaba&#x2019;s Qwen2.5:14B Instruct was selected based on performance on the development set (n=100). All LLMs were assessed on the same data, prompts, and settings (eg, temperature=0.1), supporting deterministic output [<xref ref-type="bibr" rid="ref24">24</xref>]. Ollama was chosen for its capability to facilitate local LLM deployment. The underlying LLM extraction tool for this study is available in a repository (10.6084/m9.figshare.28931030) and on GitHub [<xref ref-type="bibr" rid="ref25">25</xref>] with a complete walkthrough to adopt the framework and tooling, and incorporates components from a previously published data extraction repository [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s2-5"><title>Postprocessing of LLM Outputs</title><p>LLM outputs were checked against the predefined answer options. 
To mimic daily practice, where manual postprocessing is infeasible, responses outside these options were removed. Occasionally, the LLM returned descriptive terms accurately copied from the ED report but not listed among the predefined answers. These nonconforming terms were removed, and the feature was labeled as &#x201C;not reported.&#x201D; All performance metrics presented reflect this postprocessing step.</p></sec><sec id="s2-6"><title>Reader Study&#x2013;Manual Labeling</title><p>A reader study was conducted to compare LLM outputs, under the minimal and optimized prompt strategies, with manual labeling of 2 ED physicians using the same evaluation set (n=236). Each ED report was presented in its original format and independently reviewed by 2 ED physicians, one with 2 years of residency experience and one with 5 years of postqualification experience. Physicians labeled 16 clinical signs and symptoms using an Excel (Microsoft Corp) spreadsheet. Predefined answer options matched those given to the LLM and were provided as drop-down menus: single-choice for binary and multiclass features, and multiple-choice for multilabel features. If a sign or symptom was absent or not reported, the field was left blank.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>LLM- and physician-labeled features were evaluated using specificity, sensitivity, and accuracy. The HIVE model&#x2019;s performance was assessed via AUROCs. For multiclass and multilabel features, LLM performance metrics were calculated for each positive class and averaged over all positive classes. 95% CIs were estimated through bootstrapping with replacement.</p></sec><sec id="s2-8"><title>ML Model Development and Validation</title><p>An XGBoost (version 2.1.1) algorithm was selected to estimate the probability of appendicitis, following the approach described in our previous study [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. 
Referred to as the HIVE model, it incorporates ED intake information, vital signs, medical history, and physical examination features. As explained in this study&#x2019;s design, only routinely measured vital signs and intake information, combined with medical history and physical examination features that contributed most to our previous HIVE model, were included in this model. Following the same setup, 336 cases were divided into 268 cases for training or tuning (researcher-labeled), and 68 validation cases relabeled by the LLM, under 2 prompting strategies, and by 2 ED physicians. This allowed comparison against the original reference annotations.</p><p>For model tuning, repeated stratified 10-fold cross-validation was used to preserve class distributions. Categorical parameters were encoded using CatBoost (version 2.6.3) into numerical representations derived from training data [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Hyperparameters were optimized to maximize AUROC using Bayesian optimization with Optuna (version 3.6.1) over 100 trials [<xref ref-type="bibr" rid="ref30">30</xref>]. The optimized hyperparameters were a learning rate (&#x03B7;) of 0.012, a maximum tree depth of 3, a minimum sum of weights in a child of 4, a subsample ratio of features for each tree of 0.64, and 156 boosting rounds. Class weights were not adjusted, given that the dataset was approximately balanced (167 appendicitis vs 169 other AAP causes). Feature contributions to model predictions were quantified using TreeSHAP (version 0.46.0). To examine whether downstream performance depended on model choice, we trained a random forest on the same 268 researcher-labeled cases and evaluated it on the same 68-case validation set using the LLM-extracted features. Preprocessing matched the HIVE pipeline, with numeric missing values imputed using an Iterative Imputer (max_iter=10) for compatibility with the random forest model. 
A standard configuration was used (n_estimators=500, max_depth=None, bootstrap=True). The TRIPOD (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis) checklist was followed to ensure methodological transparency (<xref ref-type="supplementary-material" rid="app7">Checklist 1</xref>), and all code, including the LLM extraction tool, is available via figshare (10.6084/m9.figshare.28931030).</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This study was conducted according to the Declaration of Helsinki and Guidelines for Good Clinical Practice. The execution of this retrospective observational study of patient records was approved by the local review board of the Jeroen Bosch Hospital (number 2023.11.22.02) and was reviewed by the Medical Research Ethics Committee Brabant, which determined that this study was not subject to the regulations of the Dutch Medical Research Involving Human Subjects Act and waived the requirements for written informed consent and consent for publication (Medical Research Ethics Committee number NW2024-05) [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>Data were extracted from the EHR using CTcue (IQVIA) and handled in a pseudonymized manner. No directly identifiable patient information was accessed by the researchers. Data extraction and analysis were performed in a secure environment in accordance with institutional, national, and European data protection regulations. Participants received no compensation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Among 336 patients with AAP (median age 41, IQR 22&#x2010;62 years, 205/336, 61% were female), 50% (167/336) had appendicitis (<xref ref-type="table" rid="table1">Table 1</xref>, <xref ref-type="table" rid="table2">Table 2</xref>). Reports were authored by various physicians over 7 years. 
Feature prevalence ranged from 6% (n=20) for pollakiuria to 91% (n=303) for pain location (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Categorical patient characteristics (n=336). Structured EHR<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> data (intake and vital signs).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Section</td><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Sex</td><td align="left" valign="top">Intake</td><td align="left" valign="top">Female</td><td align="left" valign="top">205 (61.0)</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Transport</td><td align="left" valign="top" colspan="2">Intake</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Ambulance</td><td align="left" valign="top">32 (9.5)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Own transport</td><td align="left" valign="top">303 (90.2)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Other</td><td align="left" valign="top">1 (0.3)</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Referrer</td><td align="left" valign="top" colspan="2">Intake</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Primary care physician</td><td align="left" valign="top">219 
(65.2)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Self-referral</td><td align="left" valign="top">98 (29.2)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Hospital</td><td align="left" valign="top">9 (2.7)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Ambulance</td><td align="left" valign="top">5 (1.5)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Other facility</td><td align="left" valign="top">1 (0.3)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Not reported</td><td align="left" valign="top">4 (1.2)</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">EMV<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">Vital signs</td><td align="left" valign="top"/><td align="left" valign="top">201 (59.8)</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Q-SOFA<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">Vital signs</td><td align="left" valign="top"/><td align="left" valign="top">103 (30.7)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>EHR: electronic health record.</p></fn><fn id="table1fn2"><p><sup>b</sup>EMV: Eye Opening, Best Motor Response, Best Verbal Response.</p></fn><fn id="table1fn3"><p><sup>c</sup>Q-SOFA: quick sepsis-related organ failure assessment.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Numeric patient characteristics (n=336). 
Structured EHR<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> data (intake and vital signs).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Section</td><td align="left" valign="bottom">Median (IQR)</td><td align="left" valign="bottom">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Age</td><td align="left" valign="top">Intake</td><td align="left" valign="top">41 (22&#x2010;62)</td><td align="left" valign="top">336 (100.0)</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Prior ED<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> visits with AAP<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">Intake</td><td align="left" valign="top">1 (1-1)</td><td align="left" valign="top">336 (100.0)</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Pain rating</td><td align="left" valign="top">Intake</td><td align="left" valign="top">6 (4-8)</td><td align="left" valign="top">291 (86.6)</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Diastolic arterial pressure (mm Hg)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">79 (71&#x2010;88)</td><td align="left" valign="top">334 (99.4)</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Systolic arterial pressure (mm Hg)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">130 (118&#x2010;144)</td><td align="left" valign="top">326 (97.0)</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Mean arterial pressure (mm Hg)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">96 (88&#x2010;105)</td><td align="left" valign="top">316 
(94.0)</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Respiratory rate (rpm)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">16 (15&#x2010;19)</td><td align="left" valign="top">132 (39.3)</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Heart rate (bpm)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">85 (74&#x2010;98)</td><td align="left" valign="top">316 (94.0)</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">SIRS<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">0 (0&#x2010;1)</td><td align="left" valign="top">189 (56.3)</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Oxygen saturation</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">98 (97&#x2010;100)</td><td align="left" valign="top">334 (99.4)</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Temperature (&#x00B0;C)</td><td align="left" valign="top">Vital signs</td><td align="left" valign="top">37.1 (36.6&#x2010;37.5)</td><td align="left" valign="top">333 (99.4)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>EHR: electronic health record.</p></fn><fn id="table2fn2"><p><sup>b</sup>ED: emergency department.</p></fn><fn id="table2fn3"><p><sup>c</sup>AAP: acute abdominal pain.</p></fn><fn id="table2fn4"><p><sup>d</sup>SIRS: systemic inflammatory response syndrome.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Prevalence of 16 clinical features in ED<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> reports (medical history and physical examination; n=336).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td 
align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Section</td><td align="left" valign="bottom">Classification</td><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Class prevalence, n (%)</td><td align="left" valign="bottom">Feature prevalence, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Right lower quadrant</td><td align="left" valign="top">222 (67.0)</td><td align="left" valign="top">301 (89.6)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Left lower quadrant</td><td align="left" valign="top">32 (9.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Right upper quadrant</td><td align="left" valign="top">28 (8.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Left upper quadrant</td><td align="left" valign="top">9 (2.7)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Epigastric region</td><td align="left" valign="top">18 (5.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Hypogastric region</td><td align="left" valign="top">11 (3.3)</td><td align="left" valign="top"/></tr><tr><td align="left" 
valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Periumbilical</td><td align="left" valign="top">27 (8.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Diffuse</td><td align="left" valign="top">21 (6.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Nausea</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">177 (52.7)</td><td align="left" valign="top">177 (52.7)</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Increase</td><td align="left" valign="top">132 (39.2)</td><td align="left" valign="top">150 (44.6)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Decrease</td><td align="left" valign="top">40 (11.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multiclass</td><td align="left" valign="top">Acute</td><td align="left" valign="top">56 (16.7)</td><td align="left" valign="top">63 (18.8)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Gradual</td><td align="left" valign="top">7 (2.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">5</td><td align="left" 
valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">118 (35.1)</td><td align="left" valign="top">118 (35.1)</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">100 (29.7)</td><td align="left" valign="top">100 (29.7)</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">122 (36.3)</td><td align="left" valign="top">122 (36.3)</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to the right lower quadrant</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">77 (22.9)</td><td align="left" valign="top">77 (22.9)</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">70 (20.8)</td><td align="left" valign="top">70 (20.8)</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Adipose</td><td align="left" valign="top">44 (13.1)</td><td align="left" valign="top">131 (39.0)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top">Scars</td><td align="left" valign="top">22 (6.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Distended abdomen</td><td align="left" valign="top">25 (7.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">No abnormalities</td><td align="left" valign="top">49 (14.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">20 (6.0)</td><td align="left" valign="top">20 (6.0)</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Diarrhea</td><td align="left" valign="top">39 (11.6)</td><td align="left" valign="top">257 (76.5)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Loose</td><td align="left" valign="top">62 (18.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Normal</td><td align="left" valign="top">154 (45.8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Stiff or obstipation</td><td align="left" valign="top">31 (9.2)</td><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Right lower quadrant</td><td align="left" valign="top">166 (49.4)</td><td align="left" valign="top">303 (91.2)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Left lower quadrant</td><td align="left" valign="top">64 (19.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Right upper quadrant</td><td align="left" valign="top">29 (8.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Left upper quadrant</td><td align="left" valign="top">18 (5.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Epigastric region</td><td align="left" valign="top">34 (10.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Hypogastric region</td><td align="left" valign="top">9 (2.7)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Periumbilical</td><td align="left" valign="top">71 (21.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Diffuse</td><td align="left" valign="top">25 (7.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Flank right</td><td align="left" valign="top">8 (2.4)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Flank left</td><td align="left" valign="top">4 (1.2)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Flanks both sides</td><td align="left" valign="top">3 (1.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Attack-wise</td><td align="left" valign="top">71 (21.1)</td><td align="left" valign="top">180 (53.6)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Continuous</td><td align="left" valign="top">138 (41.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">Multilabel</td><td align="left" valign="top">Aching</td><td align="left" valign="top">46 (13.7)</td><td align="left" valign="top">122 (36.3)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top">Cramping</td><td align="left" valign="top">34 (10.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Stabbing</td><td align="left" valign="top">66 (19.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Burning</td><td align="left" valign="top">2 (0.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness (supple)</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">Binary</td><td align="left" valign="top"/><td align="left" valign="top">236 (70.2)</td><td align="left" valign="top">236 (70.2)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ED: emergency department.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Ablation Study</title><p>An ablation study on the development set (n=100) quantified the contribution of individual prompt elements (<xref ref-type="table" rid="table4">Tables 4</xref><xref ref-type="table" rid="table5"/>-<xref ref-type="table" rid="table6">6</xref>). Sequentially adding each element to the base prompt showed the largest performance gains for negation or constraint, domain-specific context, and explanation, while section limitation, AI persona, and report type produced smaller and less consistent improvements across features (<xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>). These findings are incorporated into the composition of the optimized prompts (<xref ref-type="table" rid="table4">Table 4</xref>). 
Some clarifying elements, particularly negation or constraint elements, were also integrated into the minimal prompt to ensure an equal comparison with manual annotations by ED physicians. Certain elements were not applicable to specific features (eg, when no negation or explanation was possible) and are therefore left blank in <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>. <xref ref-type="table" rid="table7">Table 7</xref> lists the minimal prompts used to extract each of the 16 features; these incorporate the same annotation rules that were provided to ED physicians. <xref ref-type="table" rid="table8">Table 8</xref> presents the corresponding optimized prompts, which integrate the prompt elements identified in the ablation study as improving extraction performance. <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> presents all prompt elements evaluated for each feature in the ablation study.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Ablation study (development set, n=100). 
Performance of base, minimal, and optimized prompting strategies.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Prompting strategies</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Base (instruction + output)</td><td align="left" valign="bottom" colspan="3">Minimal</td><td align="left" valign="bottom" colspan="3">Optimized</td></tr></thead><tbody><tr><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003; Feature</td><td align="left" valign="top">Section</td><td align="left" valign="top">Classes</td><td align="left" valign="top">Specificity<break/>&#x00B1;CI</td><td align="left" valign="top">Sensitivity<break/>&#x00B1;CI</td><td align="left" valign="top">Accuracy<break/>&#x00B1; CI</td><td align="left" valign="top">Specificity<break/>&#x00B1; CI</td><td align="left" valign="top">Sensitivity<break/>&#x00B1; CI</td><td align="left" valign="top">Accuracy<break/>&#x00B1; CI</td><td align="left" valign="top">Specificity<break/>&#x00B1; CI</td><td align="left" valign="top">Sensitivity<break/>&#x00B1; CI</td><td align="left" valign="top">Accuracy<break/>&#x00B1; CI</td></tr><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">8</td><td align="left" valign="top">0.93 &#x00B1; 0.02</td><td align="left" valign="top">0.78 &#x00B1; 0.07</td><td align="left" valign="top">0.91 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.02</td><td align="left" valign="top">0.78 &#x00B1; 0.07</td><td align="left" valign="top">0.93 &#x00B1; 0.02</td><td align="left" valign="top">0.96 &#x00B1; 0.01</td><td align="left" valign="top">0.89 &#x00B1; 0.06</td><td align="left" valign="top">0.95 &#x00B1; 0.01</td></tr><tr><td align="left" valign="top">2</td><td 
align="left" valign="top">Nausea</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">1</td><td align="left" valign="top">0.85 &#x00B1; 0.11</td><td align="left" valign="top">0.67 &#x00B1; 0.12</td><td align="left" valign="top">0.76 &#x00B1; 0.08</td><td align="left" valign="top">0.92 &#x00B1; 0.07</td><td align="left" valign="top">0.67 &#x00B1; 0.12</td><td align="left" valign="top">0.80 &#x00B1; 0.07</td><td align="left" valign="top">0.96 &#x00B1; 0.05</td><td align="left" valign="top">0.94 &#x00B1; 0.07</td><td align="left" valign="top">0.95 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">2</td><td align="left" valign="top">0.89 &#x00B1; 0.05</td><td align="left" valign="top">0.86 &#x00B1; 0.09</td><td align="left" valign="top">0.88 &#x00B1; 0.05</td><td align="left" valign="top">0.89 &#x00B1; 0.05</td><td align="left" valign="top">0.86 &#x00B1; 0.09</td><td align="left" valign="top">0.88 &#x00B1; 0.05</td><td align="left" valign="top">0.91 &#x00B1; 0.04</td><td align="left" valign="top">0.86 &#x00B1; 0.09</td><td align="left" valign="top">0.90 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">2</td><td align="left" valign="top">0.64 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.08</td><td align="left" valign="top">0.67 &#x00B1; 0.07</td><td align="left" valign="top">0.64 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.08</td><td align="left" valign="top">0.67 &#x00B1; 0.07</td><td align="left" valign="top">0.67 &#x00B1; 0.07</td><td align="left" valign="top">0.89 &#x00B1; 0.14</td><td align="left" valign="top">0.69 &#x00B1; 0.06</td></tr><tr><td align="left" valign="top">5</td><td align="left" 
valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">1</td><td align="left" valign="top">0.80 &#x00B1; 0.10</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.86 &#x00B1; 0.07</td><td align="left" valign="top">0.91 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.07</td><td align="left" valign="top">0.92 &#x00B1; 0.05</td><td align="left" valign="top">0.94 &#x00B1; 0.05</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.95 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">1</td><td align="left" valign="top">0.93 &#x00B1; 0.06</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.95 &#x00B1; 0.04</td><td align="left" valign="top">0.96 &#x00B1; 0.05</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.97 &#x00B1; 0.03</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">1</td><td align="left" valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.03</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to right lower quadrant</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">1</td><td align="left" valign="top">0.90 &#x00B1; 0.07</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.91 &#x00B1; 0.05</td><td align="left" valign="top">0.90 &#x00B1; 0.07</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.91 &#x00B1; 0.05</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">1</td><td align="left" valign="top">0.82 &#x00B1; 0.08</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.85 &#x00B1; 0.07</td><td align="left" valign="top">0.83 &#x00B1; 0.08</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.86 &#x00B1; 0.07</td><td align="left" valign="top">&#x2014;<sup>a</sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">4</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.88 &#x00B1; 0.09</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td 
align="left" valign="top">0.88 &#x00B1; 0.09</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">0.99 &#x00B1; 0.01</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.99 &#x00B1; 0.01</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">1</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">4</td><td align="left" valign="top">0.97 &#x00B1; 0.02</td><td align="left" valign="top">0.67 &#x00B1; 0.10</td><td align="left" valign="top">0.91 &#x00B1; 0.03</td><td align="left" valign="top">0.97 &#x00B1; 0.02</td><td align="left" valign="top">0.67 &#x00B1; 0.10</td><td align="left" valign="top">0.91 &#x00B1; 0.03</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.87 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.02</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">11</td><td align="left" valign="top">0.94 &#x00B1; 0.01</td><td align="left" valign="top">0.70 &#x00B1; 0.08</td><td align="left" valign="top">0.91 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.01</td><td align="left" 
valign="top">0.79 &#x00B1; 0.07</td><td align="left" valign="top">0.93 &#x00B1; 0.01</td><td align="left" valign="top">0.96 &#x00B1; 0.01</td><td align="left" valign="top">0.83 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.01</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">2</td><td align="left" valign="top">0.77 &#x00B1; 0.07</td><td align="left" valign="top">0.78 &#x00B1; 0.10</td><td align="left" valign="top">0.77 &#x00B1; 0.06</td><td align="left" valign="top">0.77 &#x00B1; 0.07</td><td align="left" valign="top">0.78 &#x00B1; 0.10</td><td align="left" valign="top">0.77 &#x00B1; 0.06</td><td align="left" valign="top">0.94 &#x00B1; 0.04</td><td align="left" valign="top">0.78 &#x00B1; 0.10</td><td align="left" valign="top">0.90 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">Medical history</td><td align="left" valign="top">4</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.02</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness (supple)</td><td align="left" valign="top">Physical examination</td><td align="left" valign="top">1</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" 
valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup>a</sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>No prompt element improved performance compared with the minimal prompt; the minimal prompt was retained.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Ablation study (development set, n=100). Incremental effect of individual prompt elements on extraction performance relative to the base prompt.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Prompt element<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="3">AI<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> persona</td><td align="left" valign="bottom" colspan="3">Report type</td><td align="left" valign="bottom" colspan="3">Section limitation</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">Feature</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td></tr><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" 
valign="top">0.93 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.79 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.91 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.02</td><td align="left" valign="top">0.75 &#x00B1; 0.08</td><td align="left" valign="top">0.91 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.02</td><td align="left" valign="top">0.75 &#x00B1; 0.08</td><td align="left" valign="top">0.92 &#x00B1; 0.02</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Nausea</td><td align="left" valign="top">0.88 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.65 &#x00B1; 0.12<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.77 &#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.90 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.65 &#x00B1; 0.12<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.78 &#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.83 &#x00B1; 0.11</td><td align="left" valign="top">0.69 &#x00B1; 0.12</td><td align="left" valign="top">0.76 &#x00B1; 0.08</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">0.84 &#x00B1; 0.06</td><td align="left" valign="top">0.86 &#x00B1; 0.09</td><td align="left" valign="top">0.84 &#x00B1; 0.05</td><td align="left" valign="top">0.83 &#x00B1; 0.06</td><td align="left" valign="top">0.86 &#x00B1; 0.09</td><td align="left" valign="top">0.83 &#x00B1; 0.05</td><td 
align="left" valign="top">0.71 &#x00B1; 0.07</td><td align="left" valign="top">0.82 &#x00B1; 0.11</td><td align="left" valign="top">0.74 &#x00B1; 0.06</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">0.62 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.08</td><td align="left" valign="top">0.65 &#x00B1; 0.07</td><td align="left" valign="top">0.62 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.08</td><td align="left" valign="top">0.65 &#x00B1; 0.07</td><td align="left" valign="top">0.57 &#x00B1; 0.07</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.60 &#x00B1; 0.07</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">0.80 &#x00B1; 0.10</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.86 &#x00B1; 0.07</td><td align="left" valign="top">0.80 &#x00B1; 0.10</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.86 &#x00B1; 0.07</td><td align="left" valign="top">0.75 &#x00B1; 0.11</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.83 &#x00B1; 0.07</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">0.93 &#x00B1; 0.06</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.95 &#x00B1; 0.04</td><td align="left" valign="top">0.91 &#x00B1; 0.06</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.94 &#x00B1; 0.04</td><td align="left" valign="top">0.91 &#x00B1; 0.06</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.94 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" 
valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.92 &#x00B1; 0.08</td><td align="left" valign="top">0.96 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to the right lower quadrant</td><td align="left" valign="top">0.68 &#x00B1; 0.10</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.74 &#x00B1; 0.08</td><td align="left" valign="top">0.73 &#x00B1; 0.10</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.78 &#x00B1; 0.08</td><td align="left" valign="top">0.87 &#x00B1; 0.07</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.90 &#x00B1; 0.05</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">0.82 &#x00B1; 0.08</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.85 &#x00B1; 0.07</td><td align="left" valign="top">0.82 &#x00B1; 0.08</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.85 &#x00B1; 0.07</td><td align="left" valign="top">0.82 &#x00B1; 0.08</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.85 &#x00B1; 0.07</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">0.99 &#x00B1; 0.01</td><td align="left" valign="top">0.91 &#x00B1; 0.08</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">0.99 &#x00B1; 0.01</td><td align="left" valign="top">0.91 &#x00B1; 0.08</td><td 
align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">0.99 &#x00B1; 0.01</td><td align="left" valign="top">0.88 &#x00B1; 0.09</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">0.97 &#x00B1; 0.02</td><td align="left" valign="top">0.66 &#x00B1; 0.10</td><td align="left" valign="top">0.90 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">0.67 &#x00B1; 0.10</td><td align="left" valign="top">0.91 &#x00B1; 0.03</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.74 &#x00B1; 0.09</td><td align="left" valign="top">0.91 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">0.93 &#x00B1; 0.02</td><td align="left" valign="top">0.69 &#x00B1; 0.08</td><td align="left" valign="top">0.91 &#x00B1; 0.02</td><td align="left" valign="top">0.94 &#x00B1; 0.02</td><td align="left" valign="top">0.73 &#x00B1; 0.08</td><td align="left" valign="top">0.91 &#x00B1; 0.02</td><td align="left" valign="top">0.94 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.76 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td 
align="left" valign="top">0.92 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">0.74 &#x00B1; 0.07</td><td align="left" valign="top">0.81 &#x00B1; 0.10</td><td align="left" valign="top">0.77 &#x00B1; 0.06</td><td align="left" valign="top">0.77 &#x00B1; 0.07</td><td align="left" valign="top">0.78 &#x00B1; 0.10</td><td align="left" valign="top">0.77 &#x00B1; 0.06</td><td align="left" valign="top">0.63 &#x00B1; 0.08</td><td align="left" valign="top">0.76 &#x00B1; 0.11</td><td align="left" valign="top">0.67 &#x00B1; 0.06</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">0.98 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.95 &#x00B1; 0.06<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.97 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.91 &#x00B1; 0.03</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.92 &#x00B1; 0.03</td><td align="left" valign="top">0.88 &#x00B1; 0.03</td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.89 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness (supple)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Each prompt element shows the isolated incremental effect compared to the base prompt.</p></fn><fn id="table5fn2"><p><sup>b</sup>AI: artificial intelligence.</p></fn><fn id="table5fn3"><p><sup>c</sup>Element retained in the optimized prompt configuration.</p></fn><fn id="table5fn4"><p><sup>d</sup>Prompt element was not applicable and was not evaluated for that specific variable.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Ablation study (development set, n=100). 
Incremental effect of individual prompt elements on extraction performance relative to the base prompt.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">Prompt element<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="left" valign="top" colspan="3">Negation or constraint</td><td align="left" valign="top" colspan="3">Domain-specific context</td><td align="left" valign="top" colspan="3">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">Feature</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td><td align="left" valign="top">Specificity &#x00B1; CI</td><td align="left" valign="top">Sensitivity &#x00B1; CI</td><td align="left" valign="top">Accuracy &#x00B1; CI</td></tr><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">0.95 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.78 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.93 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.87 &#x00B1; 0.06<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td 
align="left" valign="top">0.93 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.78 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.92 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Nausea</td><td align="left" valign="top">0.92 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.67 &#x00B1; 0.12<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.80 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.87 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.81 &#x00B1; 0.10<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.84 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.90 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.85 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.88 &#x00B1; 0.06<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">0.91 &#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" 
rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.86 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.90 &#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.78 &#x00B1; 0.07</td><td align="left" valign="top">0.90 &#x00B1; 0.09</td><td align="left" valign="top">0.81 &#x00B1; 0.05</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">0.67 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.89 &#x00B1; 0.14<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.69 &#x00B1; 0.06<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.62 &#x00B1; 0.07</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.65 &#x00B1; 0.07</td><td align="left" valign="top">0.59 &#x00B1; 0.07</td><td align="left" valign="top">0.94 &#x00B1; 0.08</td><td align="left" valign="top">0.62 &#x00B1; 0.06</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">0.91 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> 
<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.92 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.91 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.92 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.73 &#x00B1; 0.11</td><td align="left" valign="top">0.94 &#x00B1; 0.07</td><td align="left" valign="top">0.81 &#x00B1; 0.08</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">0.96 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">1.00 &#x00B1; 0.00<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.97 &#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.93 &#x00B1; 0.06</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.95 &#x00B1; 0.04</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" 
rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.96 &#x00B1; 0.03</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.97 &#x00B1; 0.04</td><td align="left" valign="top">0.96 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to the right lower quadrant</td><td align="left" valign="top">0.72 &#x00B1; 0.10</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.77 &#x00B1; 0.08</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.87 &#x00B1; 0.07</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.90 &#x00B1; 0.05</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">0.78 &#x00B1; 0.10</td><td align="left" valign="top">0.91 &#x00B1; 0.11</td><td align="left" valign="top">0.81 &#x00B1; 0.08</td><td align="left" valign="top">0.83 &#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.95 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.86 &#x00B1; 0.07<sup><xref ref-type="table-fn" 
rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.74 &#x00B1; 0.10</td><td align="left" valign="top">0.95 &#x00B1; 0.07</td><td align="left" valign="top">0.79 &#x00B1; 0.08</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">1.00 &#x00B1; 0.00</td><td align="left" valign="top">0.88 &#x00B1; 0.09</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td><td align="left" valign="top">0.99 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.98 &#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.99 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.99 &#x00B1; 0.01</td><td align="left" valign="top">0.88 &#x00B1; 0.09</td><td align="left" valign="top">0.98 &#x00B1; 0.01</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.98 &#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">1.00 &#x00B1; 0.00<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.98 &#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.98 &#x00B1; 0.03</td><td align="left" valign="top">0.50 &#x00B1; 0.33</td><td align="left" valign="top">0.95 &#x00B1; 0.03</td></tr><tr><td align="left" valign="top">12</td><td align="left" 
valign="top">Stool consistency</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.97 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.81 &#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.97 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.73 &#x00B1; 0.09<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.92 &#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">0.95 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.79 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.93 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> <sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.94 &#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.78 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.93 &#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td 
align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">0.92 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.75 &#x00B1; 0.11<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.87 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.72 &#x00B1; 0.07</td><td align="left" valign="top">0.83 &#x00B1; 0.09</td><td align="left" valign="top">0.75 &#x00B1; 0.06</td><td align="left" valign="top">0.79 &#x00B1; 0.07<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.80 &#x00B1; 0.10<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.80 &#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">0.95 &#x00B1; 0.06</td><td align="left" valign="top">0.96 &#x00B1; 0.02</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" 
rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness (supple)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Each prompt element shows the isolated incremental effect compared to the base prompt.</p></fn><fn id="table6fn2"><p><sup>b</sup>Element retained in the minimal prompt configuration. 
(Additional instruction shared with physician)</p></fn><fn id="table6fn3"><p><sup>c</sup>Element retained in the optimized prompt configuration.</p></fn><fn id="table6fn4"><p><sup>d</sup>Prompt element was not applicable and was not evaluated for that specific variable.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Minimal 0-shot prompts as presented to the LLM.<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup></p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Prompt<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">Report where abdominal pain is present in this patient. If multiple locations of abdominal pain are mentioned, report the most painful location (PM<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup>).<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the following options: &#x201C;right lower quadrant,&#x201D; &#x201C;right upper quadrant,&#x201D; &#x201C;left lower quadrant,&#x201D; &#x201C;left upper quadrant,&#x201D; &#x201C;entire upper abdomen,&#x201D; &#x201C;entire lower abdomen,&#x201D; &#x201C;entire left abdomen,&#x201D; &#x201C;entire right abdomen,&#x201D; &#x201C;periumbilical,&#x201D; &#x201C;epigastric,&#x201D; &#x201C;hypogastric,&#x201D; &#x201C;diffuse,&#x201D; or &#x201C;absent.&#x201D; Report &#x201C;absent&#x201D; if abdominal pain is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Nausea</td><td align="left" valign="top">Report whether this patient has nausea based on the last status in the report. Ignore vomiting. 
Vomiting is not the same as nausea.<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the following options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if nausea is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">Report whether there is an increase or decrease in the development of complaints after the onset in this patient. A decrease in the development of complaints occurring after the administration of pain medication must be ignored and should not be reported as a decrease.<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the following options: &#x201C;increase,&#x201D; &#x201C;decrease,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, the answer should be &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">Report whether there is a description of how the onset of pain started in this patient. Limit your answer to the following options: &#x201C;acute,&#x201D; &#x201C;gradual,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, the answer should be &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">Report whether this patient has abdominal pain at McBurney&#x2019;s point. 
If McBurney&#x2019;s sign is dubious, report &#x201C;no.&#x201D;<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if abdominal pain at McBurney&#x2019;s point is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">Report whether this patient has rebound tenderness. Ignore contralateral rebound tenderness.<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if rebound tenderness is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" valign="top">Report whether this patient has anorexia. Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if anorexia is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to the right lower quadrant</td><td align="left" valign="top">Report whether the abdominal pain has migrated to the right lower quadrant in the patient. Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if migration to the right lower quadrant is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">Report whether this patient has a fever. Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Note that the term may not be mentioned exactly as written. Yes=recorded elevated temperature, feeling of fever, measured fever, feverish, temperature or &#x201C;T&#x201D; of 38&#x00B0;C or higher. 
No=temperature or &#x201C;T&#x201D; of 37.9&#x00B0;C or lower, no fever, not feverish, no elevated temperature.<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Report &#x201C;no&#x201D; if fever is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">Report whether any abnormalities are noted in the inspection of the abdomen for this patient. Limit your answer to the following options: &#x201C;adipose,&#x201D; &#x201C;distended abdomen,&#x201D; &#x201C;scars,&#x201D; &#x201C;no abnormalities,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">Report whether this patient has pollakiuria. Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if pollakiuria is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">Report whether stool consistency is described for this patient. Limit your answer to the following options: &#x201C;diarrhea,&#x201D; &#x201C;loose,&#x201D; &#x201C;normal,&#x201D; &#x201C;stiff/obstipation,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">Report where the pain initially started in this patient. Ignore pain resulting from displacement, migration, or radiation. Ignore abdominal tenderness. 
Ignore findings from the physical examination.<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> Limit your answer to the following options: &#x201C;right lower quadrant,&#x201D; &#x201C;right upper quadrant,&#x201D; &#x201C;left lower quadrant,&#x201D; &#x201C;left upper quadrant,&#x201D; &#x201C;entire upper abdomen,&#x201D; &#x201C;entire lower abdomen,&#x201D; &#x201C;entire left abdomen,&#x201D; &#x201C;entire right abdomen,&#x201D; &#x201C;around the navel,&#x201D; &#x201C;epigastric,&#x201D; &#x201C;hypogastric,&#x201D; &#x201C;diffuse,&#x201D; &#x201C;bilateral flanks,&#x201D; &#x201C;right flank,&#x201D; &#x201C;left flank,&#x201D; or &#x201C;absent.&#x201D; Report &#x201C;absent&#x201D; if the location where the pain initially started is not mentioned.</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">Report whether this patient has a specific abdominal pain pattern. Limit your answer to the following options: &#x201C;attack wise,&#x201D; &#x201C;continuous,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">Report whether the nature of pain is described for this patient. Limit your answer to the following options: &#x201C;stabbing,&#x201D; &#x201C;aching,&#x201D; &#x201C;cramping,&#x201D; &#x201C;burning,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness</td><td align="left" valign="top">Report whether this patient has a supple abdomen upon palpation. 
Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Report &#x201C;no&#x201D; if a supple abdomen upon palpation is not mentioned or explicitly absent.</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table7fn2"><p><sup>b</sup>All prompts and emergency department reports were provided to the large language model in Dutch and have been translated into English for readability. </p></fn><fn id="table7fn3"><p><sup>c</sup>PM: punctum maximum.</p></fn><fn id="table7fn4"><p><sup>d</sup>Annotation rules shared with or instructed to emergency department physicians for manual labeling and included in the minimal prompt. </p></fn></table-wrap-foot></table-wrap><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Optimized 0-shot prompts as presented to the LLM.<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup></p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Prompt<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">You are a medical information extraction specialist.<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> Report where abdominal pain is present in this patient. 
If multiple locations of abdominal pain are mentioned, report the most painful location (punctum maximum/PM).<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Limit your answer to the following options: &#x201C;right lower quadrant,&#x201D; &#x201C;right upper quadrant,&#x201D; &#x201C;left lower quadrant,&#x201D; &#x201C;left upper quadrant,&#x201D; &#x201C;entire upper abdomen,&#x201D; &#x201C;entire lower abdomen,&#x201D; &#x201C;entire left abdomen,&#x201D; &#x201C;entire right abdomen,&#x201D; &#x201C;periumbilical,&#x201D; &#x201C;epigastric,&#x201D; &#x201C;hypogastric,&#x201D; &#x201C;diffuse,&#x201D; or &#x201C;absent.&#x201D; Note that these terms may not be mentioned exactly as listed. Right lower quadrant=RLQ, RLA, McBurney, right iliac fossa. Right upper quadrant=RUQ, Murphy. Left lower quadrant=LLQ, LLA, left iliac fossa. Left upper quadrant=LUQ. Entire right abdomen=right hemiabdomen. Entire left abdomen=left hemiabdomen. Periumbilical=around the navel, paraumbilical, supraumbilical. Epigastric=stomach region. Hypogastric=uterus, suprapubic, above pubis. Diffuse=entire abdomen. Absent=no abdominal pain. During the physical examination, the physician presses on all these locations and describes where pain is felt.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> Report &#x201C;absent&#x201D; if pain is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Nausea</td><td align="left" valign="top">You are a medical information extraction specialist.<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> Your task is to report from the emergency department report whether<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> this patient has nausea based on the last status in the report. Ignore vomiting. 
Vomiting is not the same as nausea.<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Limit your answer to the following options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Note that these terms may not be mentioned exactly as listed. Yes=N+, N+. No=N&#x2212;, N&#x2212;. &#x201C;N&#x201D; stands for nausea. &#x201C;N&#x201D; often appears in combination with &#x201C;V,&#x201D; for example, &#x201C;N+ V&#x2212;.&#x201D; &#x201C;+&#x201D; means nausea is present. &#x201C;&#x2212;&#x201D; means nausea is absent.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> Report &#x201C;no&#x201D; if nausea is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Development of complaints</td><td align="left" valign="top">Report whether there is an increase or decrease in the development of complaints after the onset in this patient. A decrease in the development of complaints occurring after the administration of pain medication must be ignored and should not be reported as a decrease.<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Ignore migration and radiation of abdominal pain. These do not count as an increase in the progression of abdominal pain complaints.<sup><xref ref-type="table-fn" rid="table8fn6">f</xref></sup> Limit your answer to the following options: &#x201C;increase,&#x201D; &#x201C;decrease,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, the answer should be &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">Report whether there is a description of how the onset of pain started in this patient. 
Ignore the timing of onset and focus only on how the complaints began.<sup><xref ref-type="table-fn" rid="table8fn6">f</xref></sup> Limit your answer to the following options: &#x201C;acute,&#x201D; &#x201C;gradual,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, the answer should be &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">Report whether this patient has abdominal pain at McBurney's point. If McBurney's sign is dubious, report &#x201C;no.&#x201D;<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Note that the term may not be mentioned exactly as listed. Yes=McBurney +, PM McBurney, punctum maximum McBurney. No=McBurney &#x2212;, no pain at McBurney's point, not at McBurney.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> Report &#x201C;no&#x201D; if abdominal pain at McBurney's point is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Pain migration to right lower quadrant</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Fever</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">10</td><td align="left" 
valign="top">Abdominal inspection</td><td align="left" valign="top">Report whether any abnormalities are noted in the inspection of the abdomen for this patient. Limit your answer to the following options: &#x201C;adipose,&#x201D; &#x201C;distended abdomen,&#x201D; &#x201C;scars,&#x201D; &#x201C;no abnormalities,&#x201D; or &#x201C;not reported.&#x201D; Note that the terms may not be mentioned exactly as listed: adipose=obese, adiposity. Distended abdomen=bloated, distended, tense. Scar=Pfannenstiel.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">Report whether this patient has pollakiuria. Limit your answer to the options: &#x201C;yes&#x201D; or &#x201C;no.&#x201D; Note that the term may not be mentioned exactly as listed: Yes=urinating in small amounts, frequent urination, urinating more often than normal, pollakiuria +. No=pollakiuria &#x2212;.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> Report &#x201C;no&#x201D; if pollakiuria is not mentioned or explicitly absent.</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">Report whether the stool consistency is described for this patient. Limit your answer to the following options: &#x201C;diarrhea,&#x201D; &#x201C;loose,&#x201D; &#x201C;normal,&#x201D; &#x201C;stiff/obstipation,&#x201D; or &#x201C;not reported.&#x201D; Note that the terms may not be mentioned exactly as listed: diarrhea=watery, liquid stool, extremely loose, explosive. Normal=na (no abnormalities), no notable findings, no deviations, no diarrhea or obstipation. Constipation=difficult passage. 
Stool may also be referred to as defecation or def.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Pain location</td><td align="left" valign="top">Report where the pain initially started in this patient. Limit yourself to the medical history section.<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> Ignore pain resulting from displacement, migration, or radiation. Ignore abdominal tenderness. Ignore findings from the physical examination.<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Limit your answer to the following options: &#x201C;right lower quadrant,&#x201D; &#x201C;right upper quadrant,&#x201D; &#x201C;left lower quadrant,&#x201D; &#x201C;left upper quadrant,&#x201D; &#x201C;entire upper abdomen,&#x201D; &#x201C;entire lower abdomen,&#x201D; &#x201C;entire left abdomen,&#x201D; &#x201C;entire right abdomen,&#x201D; &#x201C;around the navel,&#x201D; &#x201C;epigastric,&#x201D; &#x201C;hypogastric,&#x201D; &#x201C;diffuse,&#x201D; &#x201C;bilateral flanks,&#x201D; &#x201C;right flank,&#x201D; &#x201C;left flank,&#x201D; &#x201C;absent.&#x201D; Note that the terms may not be mentioned exactly as listed: right lower quadrant=RLQ, RLA. Right upper quadrant=RUQ. Left lower quadrant=LLQ, LLA. Left upper quadrant=LUQ. Around the navel=periumbilical, para-umbilical. Entire right abdomen=right hemiabdomen. Entire left abdomen=left hemiabdomen. Entire lower abdomen=lower part of the abdomen. Hypogastric=uterus, suprapubic, above pubis. Epigastric=stomach region. 
Diffuse=entire abdomen.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> Report &#x201C;absent&#x201D; if the location where the pain initially started is not mentioned.</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">Report whether this patient has a specific abdominal pain pattern. Ignore descriptions of acute, sudden, or stabbing abdominal pain, as these are not specific pain patterns.<sup><xref ref-type="table-fn" rid="table8fn6">f</xref></sup> Limit your answer to the following options: &#x201C;attack wise,&#x201D; &#x201C;continuous,&#x201D; or &#x201C;not reported.&#x201D; Note that the terms may not be mentioned exactly as written: attack wise=worsening in waves, peak periods, fluctuating, intermittent, variable in intensity, flare-ups, colicky pain, on and off.<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">You are a medical information extraction specialist.<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> Report whether the nature of pain is described for this patient. 
Limit your answer to the following options: &#x201C;stabbing,&#x201D; &#x201C;aching,&#x201D; &#x201C;cramping,&#x201D; &#x201C;burning,&#x201D; or &#x201C;not reported.&#x201D; If none of the mentioned options are explicitly stated in the text, answer &#x201C;not reported.&#x201D;</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Palpation tenderness</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table8fn2"><p><sup>b</sup>All prompts and emergency department reports were provided to the large language model in Dutch and have been translated into English for readability. </p></fn><fn id="table8fn3"><p><sup>c</sup>Broader context: (1) artificial intelligence persona or (2) report type. These are prompt elements included in the optimized prompt. </p></fn><fn id="table8fn4"><p><sup>d</sup>Annotation rules shared with or instructed to emergency department physicians for manual labeling and included in the minimal prompt. </p></fn><fn id="table8fn5"><p><sup>e</sup>Specific context: (5) domain-appropriate terminology or abbreviations, or (6) explanation of the symptom.</p></fn><fn id="table8fn6"><p><sup>f</sup>Instruction: (3) section limitation or (4) negations or constraints. </p></fn><fn id="table8fn7"><p><sup>g</sup>No prompt element improved performance compared with the minimal prompt; the minimal prompt was retained.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Minimal vs Optimized Prompt Strategies</title><p><xref ref-type="table" rid="table9">Table 9</xref> presents the feature-level performance of the LLM under both prompt strategies, evaluated against a reference standard based on lead researcher annotations, shown alongside manual labeling by ED physicians on the evaluation set (n=236). 
Using the minimal prompt strategy, the LLM achieved an average specificity of 0.929 (95% CI &#x00B1;0.020), sensitivity of 0.810 (95% CI &#x00B1;0.058), and accuracy of 0.910 (95% CI &#x00B1;0.018). Lower feature prevalence in the ED reports did not consistently result in lower LLM accuracy (<xref ref-type="table" rid="table3">Table 3</xref>). When applying the optimized prompt strategy, performance improved across all metrics; specificity increased to 0.943 (95% CI &#x00B1;0.017), sensitivity to 0.859 (95% CI &#x00B1;0.055), and accuracy to 0.929 (95% CI &#x00B1;0.016). The 5 largest gains in accuracy were observed for nausea, pain manifestation, onset of pain, pain location, and abdominal pain location. No gains appeared for rebound tenderness, anorexia, pain migration to the right lower quadrant, fever, and palpation tenderness. Optimized prompts were particularly beneficial for features involving domain-specific terminology, abbreviations, or overlapping definitions, such as nausea and vomiting.</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Extraction performance of the LLM<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup>, under both prompting strategies, vs ED<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup> physicians on the evaluation set (n=236). 
Each row represents a clinical feature with the number of positive answer options (classes).</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Minimal 0-shot</td><td align="left" valign="bottom" colspan="3">Optimized 0-shot</td><td align="left" valign="bottom" colspan="3">ED physician 1</td><td align="left" valign="bottom" colspan="3">ED physician 2</td><td align="left" valign="bottom" colspan="3">Comparisons<sup><xref ref-type="table-fn" rid="table9fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Feature</td><td align="left" valign="top">Classes<sup><xref ref-type="table-fn" rid="table9fn4">d</xref></sup></td><td align="left" valign="top">Spec.<sup><xref ref-type="table-fn" rid="table9fn5">e</xref></sup><break/>&#x00B1; CI</td><td align="left" valign="top">Sens.<sup><xref ref-type="table-fn" rid="table9fn6">f</xref></sup><break/>&#x00B1; CI</td><td align="left" valign="top">Acc.<sup><xref ref-type="table-fn" rid="table9fn7">g</xref></sup><break/>&#x00B1; CI</td><td align="left" valign="top">Spec.<break/>&#x00B1; CI</td><td align="left" valign="top">Sens.<break/>&#x00B1; CI</td><td align="left" valign="top">Acc.<break/>&#x00B1; CI</td><td align="left" valign="top">Spec.<break/>&#x00B1; CI</td><td align="left" valign="top">Sens.<break/>&#x00B1; CI</td><td align="left" valign="top">Acc.<break/>&#x00B1; CI</td><td align="left" valign="top">Spec.<break/>&#x00B1; CI</td><td align="left" valign="top">Sens.<break/>&#x00B1; CI</td><td align="left" valign="top">Acc.<break/>&#x00B1; CI</td><td align="left" valign="top">&#x0394; Acc.<sup><xref ref-type="table-fn" rid="table9fn8">h</xref></sup></td><td align="left" valign="top">&#x0394; Acc.<sup><xref ref-type="table-fn" rid="table9fn9">i</xref></sup></td><td align="left" valign="top">&#x0394; Acc.<sup><xref ref-type="table-fn" 
rid="table9fn10">j</xref></sup></td></tr><tr><td align="left" valign="top">Abdominal pain location</td><td align="left" valign="top">8</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.76<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.87<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.78<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.02</td><td align="left" valign="top">&#x2212;0.04</td><td align="left" valign="top">0.01</td></tr><tr><td align="left" valign="top">Nausea</td><td align="left" valign="top">1</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.74<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.82<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.82<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.89<break/>&#x00B1; 0.04</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.11</td><td align="left" valign="top">0.05</td><td align="left" valign="top">&#x2212;0.04</td></tr><tr><td align="left" valign="top">Development of complaints</td><td 
align="left" valign="top">2</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.84<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.87<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.81<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.76<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.74<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.01</td><td align="left" valign="top">&#x2212;0.05</td><td align="left" valign="top">&#x2212;0.03</td></tr><tr><td align="left" valign="top">Onset of pain</td><td align="left" valign="top">2</td><td align="left" valign="top">0.62<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.65<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.65<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.85<break/>&#x00B1; 0.10</td><td align="left" valign="top">0.67<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.69<break/>&#x00B1; 0.13</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.84<break/>&#x00B1; 0.10</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.03</td><td align="left" valign="top">&#x2212;0.29</td><td align="left" valign="top">&#x2212;0.31</td></tr><tr><td align="left" valign="top">McBurney&#x2019;s sign</td><td align="left" valign="top">1</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03</td><td 
align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.01</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td></tr><tr><td align="left" valign="top">Rebound tenderness</td><td align="left" valign="top">1</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.97<break/>&#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.84<break/>&#x00B1; 0.09</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.03</td></tr><tr><td align="left" valign="top">Anorexia</td><td align="left" valign="top">1</td><td align="left" 
valign="top">0.97<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.83<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.83<break/>&#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.92<break/>&#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.81<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.59<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.85<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.07</td></tr><tr><td align="left" valign="top">Pain migration to right lower quadrant</td><td align="left" valign="top">1</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.89<break/>&#x00B1; 0.08</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.05<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.89<break/>&#x00B1; 0.08<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.88<break/>&#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.86<break/>&#x00B1; 0.09</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.89<break/>&#x00B1; 0.08</td><td align="left" 
valign="top">0.95<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.00</td><td align="left" valign="top">&#x2212;0.06</td><td align="left" valign="top">&#x2212;0.07</td></tr><tr><td align="left" valign="top">Fever</td><td align="left" valign="top">1</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.77<break/>&#x00B1; 0.11</td><td align="left" valign="top">0.89<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.77<break/>&#x00B1; 0.11<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.89<break/>&#x00B1; 0.04<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.94<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.00</td><td align="left" valign="top">&#x2212;0.05</td><td align="left" valign="top">&#x2212;0.05</td></tr><tr><td align="left" valign="top">Abdominal inspection</td><td align="left" valign="top">4</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">1.00<break/>&#x00B1; 
0.00</td><td align="left" valign="top">0.89<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.00</td><td align="left" valign="top">&#x2212;0.00</td><td align="left" valign="top">&#x2212;0.01</td></tr><tr><td align="left" valign="top">Pollakiuria</td><td align="left" valign="top">1</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.57<break/>&#x00B1; 0.25</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.86<break/>&#x00B1; 0.18</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.46<break/>&#x00B1; 0.23</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.01</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.69<break/>&#x00B1; 0.23</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.01</td></tr><tr><td align="left" valign="top">Stool consistency</td><td align="left" valign="top">4</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.70<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.83<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.75<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.73<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.94<break/>&#x00B1; 
0.01</td><td align="left" valign="top">0.01</td><td align="left" valign="top">0.02</td><td align="left" valign="top">&#x2212;0.01</td></tr><tr><td align="left" valign="top">Pain location</td><td align="left" valign="top">11</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.75<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.80<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.85<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.71<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.02</td><td align="left" valign="top">&#x2212;0.02</td><td align="left" valign="top">&#x2212;0.01</td></tr><tr><td align="left" valign="top">Pain manifestation</td><td align="left" valign="top">2</td><td align="left" valign="top">0.75<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.85<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.78<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.79<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.88<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.87<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.04</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.11</td><td align="left" valign="top">&#x2212;0.06</td><td 
align="left" valign="top">&#x2212;0.05</td></tr><tr><td align="left" valign="top">Nature of pain</td><td align="left" valign="top">4</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.92<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.02</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.99<break/>&#x00B1; 0.01</td><td align="left" valign="top">1.00<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.82<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.00</td><td align="left" valign="top">&#x2212;0.05</td><td align="left" valign="top">&#x2212;0.04</td></tr><tr><td align="left" valign="top">Palpation tenderness supple</td><td align="left" valign="top">1</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.03</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.06<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.96<break/>&#x00B1; 0.03<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.96<break/>&#x00B1; 0.02<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup></td><td align="left" valign="top">0.93<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.00</td><td align="left" valign="top">0.97<break/>&#x00B1; 0.07</td><td align="left" valign="top">0.96<break/>&#x00B1; 
0.01</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td></tr><tr><td align="left" valign="top">Weighted average<sup><xref ref-type="table-fn" rid="table9fn12">l</xref></sup></td><td align="left" valign="top">45</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.81<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.91<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.94<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.86<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.93<break/>&#x00B1; 0.02</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.86<break/>&#x00B1; 0.05</td><td align="left" valign="top">0.96<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.98<break/>&#x00B1; 0.01</td><td align="left" valign="top">0.79<break/>&#x00B1; 0.06</td><td align="left" valign="top">0.95<break/>&#x00B1; 0.02</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table9fn2"><p><sup>b</sup>ED: emergency department.</p></fn><fn id="table9fn3"><p><sup>c</sup>&#x0394; Accuracy columns show pairwise accuracy differences. Positive &#x0394; values indicate improved accuracy of optimized prompts compared with minimal prompts or those of physicians.</p></fn><fn id="table9fn4"><p><sup>d</sup>Each class is evaluated as a separate binary classification (present vs absent), meaning the number of binary decisions equals twice the number of classes. 
</p></fn><fn id="table9fn5"><p><sup>e</sup>Spec.: specificity.</p></fn><fn id="table9fn6"><p><sup>f</sup>Sens.: sensitivity.</p></fn><fn id="table9fn7"><p><sup>g</sup>Acc.: accuracy.</p></fn><fn id="table9fn8"><p><sup>h</sup>Accuracy difference (&#x0394;) between minimal vs optimized prompts.</p></fn><fn id="table9fn9"><p><sup>i</sup>Accuracy difference (&#x0394;) between emergency department physician 1 vs optimized prompts.</p></fn><fn id="table9fn10"><p><sup>j</sup>Accuracy difference (&#x0394;) between emergency department physician 2 vs optimized prompts.</p></fn><fn id="table9fn11"><p><sup>k</sup>Optimized prompts were retained only when they improved performance on the development set; otherwise, the minimal prompt was used.</p></fn><fn id="table9fn12"><p><sup>l</sup>Weighted averages account for the number of classes per feature. CIs (95% CI) were estimated using bootstrapping.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>LLM vs Physician Labeling</title><p>Both physicians achieved higher weighted average specificity and accuracy on the evaluation set than the LLM under either prompting strategy. Physician 1 reached a specificity of 0.978 (95% CI &#x00B1;0.011), sensitivity of 0.859 (95% CI &#x00B1;0.015), and accuracy of 0.961 (95% CI &#x00B1;0.012), while physician 2 achieved a specificity of 0.980 (95% CI &#x00B1;0.008), sensitivity of 0.793 (95% CI &#x00B1;0.056), and accuracy of 0.951 (95% CI &#x00B1;0.015; <xref ref-type="table" rid="table2">Table 2</xref>).</p><p>At the feature level, the LLM using the optimized prompt matched or exceeded the accuracy of physician 1 for 8 of 16 features, and for physician 2 for 9 of 16. Notably, both physicians outperformed the LLM on the onset of pain, migration to the right lower quadrant, and pain manifestation. For the remaining features, physician performance was slightly higher, with marginal accuracy differences. 
All feature-level outputs for this comparison are available in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p></sec><sec id="s3-5"><title>HIVE Model Performance Using LLM vs Physicians&#x2019; Inputs</title><p>To compare predictive performance using LLM- vs physician-labeled input features, the HIVE model was evaluated for its ability to distinguish appendicitis from other AAP causes on the validation set (n=68; <xref ref-type="fig" rid="figure3">Figure 3</xref>). Using LLM-extracted features, the HIVE model achieved an AUROC of 0.871 (95% CI &#x00B1;0.019) with the minimal prompt, and 0.911 (95% CI &#x00B1;0.014) with the optimized prompt. In comparison, AUROCs were 0.917 (95% CI &#x00B1;0.015) using features labeled by physician 1, and 0.924 (95% CI &#x00B1;0.018) using those from physician 2.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>AUROC of the HIVE model for predicting appendicitis vs other AAP causes (ie, no appendicitis) in the validation set (n=68). (A,B) HIVE model performance using LLM-extracted features obtained through the minimal and optimized 0-shot prompting, respectively. (C,D) HIVE model performance using manually labeled features by ED physician 1 and physician 2, respectively. AUROCs are reported with 95% CIs. AAP: acute abdominal pain; AUROC: area under the receiver operating characteristic curve; HIVE: History, Intake, Vitals, Examination; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig03.png"/></fig><p>SHAP analysis demonstrated consistent feature importance across all input types (<xref ref-type="fig" rid="figure4">Figure 4</xref>). Features are displayed in descending order of their mean absolute SHAP contributions. 
The overall SHAP distributions were nearly identical, with the top 10 features cumulatively contributing 85% to 85.9% of the model output (full list in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Although the minimal 0-shot prompt achieved a lower AUROC of 0.871 (95% CI &#x00B1;0.019), its feature importance pattern closely resembled those of the optimized and physician-labeled inputs.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Feature contributions to the HIVE model across 4 different input types on the validation set (n=68). Contributions of each feature to the model are SHAP (Shapley Additive Explanations) values scaled and plotted as percentage contributions to the prediction. (A,B) Feature contributions to the HIVE model performance using LLM-extracted features obtained through the minimal and optimized 0-shot prompting, respectively. (C,D) Feature contributions to the HIVE model performance using manually labeled features by ED physician 1 and physician 2, respectively. 
Abd.: abdominal; ED: emergency department; HIVE: History, Intake, Vitals, Examination; LLM: large language model; MAP: mean arterial pressure; MH: medical history; PE: physical examination; RLQ: right lower quadrant.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig04.png"/></fig><p>The random forest performed similarly to the XGBoost-based HIVE model, with AUROCs of 0.909 (95% CI &#x00B1;0.13) using the optimized prompt strategy and 0.883 (95% CI &#x00B1;0.13) using the minimal prompt strategy (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>).</p></sec><sec id="s3-6"><title>Error Analysis LLM</title><p>To examine misclassifications, 8 evaluation cases from the 4 lowest-performing features, pain manifestation, complaint development, pain onset, and abdominal-pain location, were reviewed to illustrate diverse error types.</p><p>The first 4 cases in <xref ref-type="fig" rid="figure5">Figure 5</xref> highlight challenges with multilabel outputs, where the LLM was expected to report a combination of applicable labels but instead provided only a single answer. Specifically, in cases 1 and 2, pain manifestation evolved from continuous to attack-wise, yet the LLM recognized only 1 label. Similarly, in cases 3 and 4, the LLM missed fluctuating symptom development over time. In <xref ref-type="fig" rid="figure6">Figure 6</xref>, cases 5 and 6 illustrate 2 reports lacking explicit mention of the onset of pain; the LLM inferred labels such as &#x201C;acute&#x201D; or &#x201C;gradual,&#x201D; whereas reference was &#x201C;not reported.&#x201D; In case 7, the ED report described a punctum maximum located 4 cm to the right of the umbilicus. 
While this was labeled as &#x201C;periumbilical&#x201D; in the reference standard, the LLM returned &#x201C;right lower quadrant.&#x201D; In case 8, despite instructions to report only the primary pain location, the LLM provided both &#x201C;entire right abdomen&#x201D; and &#x201C;diffuse&#x201D; for a case describing diffuse abdominal pain with pronounced tenderness in the right hemiabdomen.</p><p>Each panel presents a snippet from an ED report alongside the corresponding LLM output. Words highlighted in magenta within the ED report indicate incorrect (false negative or false positive) or missed interpretations by the LLM, while words in green represent correct (true positive) interpretations. On the right-hand side, the LLM&#x2019;s JSON output displays the unique patient identifier, the target clinical feature (magenta), and the extracted output (green). All prompts and ED reports were provided to the LLM in Dutch and have been translated into English for readability.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Case analysis of 8 ED reports across 4 features from the evaluation set. ED: emergency department; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Case analysis of 8 ED reports across 4 features from the evaluation set. ED: emergency department; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e81500_fig06.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Conventional ML prediction models typically require structured input, posing a significant barrier to clinical integration, especially in EDs where physicians predominantly use free-text reports. 
This study demonstrates that a locally deployable LLM can automatically extract clinically relevant signs and symptoms from unstructured text, including binary, multiclass, and multilabel features. Using an optimized prompt strategy, the LLM reliably extracted features from free-text ED reports, enabling the appendicitis prediction model to achieve diagnostic performance comparable to that based on manually labeled features by ED physicians. This performance was consistent with that achieved in our previous study [<xref ref-type="bibr" rid="ref18">18</xref>], underscoring the reliability of the LLM in this context.</p><p>Numerous ML prediction models have relied on clinical signs and symptoms to support early disease detection and guide ED workflows. While NLP techniques are commonly used to structure this data, each comes with limitations. Conventional techniques, such as bag-of-words, term frequency-inverse document frequency, and phrase skip-gram, rely on word frequency or co-occurrence, with limited ability to capture contextual meaning [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. As a result, they often miss subtle language cues, such as symptom affirmation, negation, or uncertainty, which are essential in clinical interpretation. Topic modeling can uncover broad textual themes but may not align well with specific clinical concepts [<xref ref-type="bibr" rid="ref33">33</xref>]. Moreover, these techniques are unsupervised, meaning they do not rely on labeled data, which makes their output more difficult to interpret and control. 
Deep learning models used in this context (eg, long short-term memory or BERT) understand the meaning of words in a broader context [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref40">40</xref>] but require fine-tuning to handle domain-specific language, depend on large annotated datasets, and are challenged by imbalanced data distributions, limiting their performance and generalizability in real-world settings [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>A small, locally hosted LLM for feature extraction can be deployed with modest requirements, offering advantages in data security and independence from external infrastructure. In comparison with conventional NLP or deep learning models, the LLM used in this study required neither extensive preprocessing nor fine-tuning nor large annotated datasets. Instead, it successfully extracted relevant features directly from raw text using relatively simple 0-shot prompting strategies. Importantly, the LLM was used solely for feature extraction; diagnostic predictions were generated by a separate ML prediction model. This modular approach preserves interpretability and statistical transparency, including calibrated probabilities and CIs. SHAP analysis confirmed that the model&#x2019;s internal reasoning remained stable across input types: AUROC differences primarily reflected feature extraction accuracy rather than alterations in model reasoning. A random forest benchmark produced similar AUROCs to the XGBoost algorithm, indicating that the predictive signal in the LLM-extracted features is consistent across ML algorithms and not dependent on a specific model choice. Although the approach is lightweight and easy to deploy, real-world use still requires seamless EHR integration and fast inference. 
Modern EHR systems increasingly provide vendor APIs and Python-based toolkits for AI integration, but embedding such workflows into routine EHR practice remains a significant challenge [<xref ref-type="bibr" rid="ref43">43</xref>]. Given the diagnostic complexity of appendicitis [<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>], this workflow could function as a safety net by flagging high-risk presentations and supporting diagnostic decisions. Larger multicenter evaluations, including physician performance with and without AI assistance, will be needed to determine real-world value.</p><p>The ablation study showed that adding specific, symptom-related context, including explanatory cues, domain-specific terminology, and negation or constraint handling, most consistently improved extraction performance. These elements enhanced the LLM&#x2019;s ability to interpret synonyms and abbreviations commonly used in Dutch ED reports (eg, &#x201C;misselijkheid,&#x201D; &#x201C;nausea,&#x201D; or &#x201C;N+&#x201D; for nausea present) and to distinguish related but distinct concepts (eg, ignore vomiting when assessing nausea). In contrast, broader context elements such as report type, AI persona, or section limitation contributed little to performance. The LLM generalized well across features, classes, and prevalence levels, without requiring balancing strategies, and the close agreement with physician annotations suggests that more complex prompting strategies (eg, few-shot or chain-of-thought) add limited value for this task.</p><p>Despite its strengths, error analysis revealed that the LLM struggled with multilabel features, particularly when symptoms evolve over time. Clearly and consistently described features such as nausea or stool consistency were extracted more reliably than the onset of pain, which the LLM may have inferred from the surrounding context. 
In some cases, the LLM failed to prioritize the most prominent symptom, possibly due to reliance on textual similarity over clinical meaning, as observed in determining abdominal pain location. The LLM occasionally returned descriptive terms that matched the ED report but not the prescribed answer options. Some discrepancies also reflected inherent ambiguities between LLM output and manual annotations. Importantly, the LLM did not hallucinate nonexistent features outside the instructed features and answer options. These findings highlight the potential of LLM-based feature extraction but also its limitations in handling time discrepancies or ambiguity, which are common challenges in clinical narratives.</p><p>This study has several limitations. Only 1 LLM (Qwen 2.5:14B) was evaluated, and performance may differ across models with different architectures or training corpora. As LLMs evolve, future LLMs with improved language understanding or instruction-tuned capabilities may outperform current results and reduce the need for prompt optimization. Nonetheless, given the complexity of ED reports, including abbreviations, medical jargon, and ambiguous phrasing, a structured prompting framework, as presented in this study, will likely remain necessary to ensure consistency and clinically accurate outputs. This study focused on Dutch-language ED reports, reflecting the clinical and linguistic context of our setting. While the prompting framework may be transferable, prompt content must be tailored to language-specific clinical conventions [<xref ref-type="bibr" rid="ref47">47</xref>]. Translating Dutch reports would not serve as a valid proxy for cross-lingual evaluation because it measures translation quality rather than the LLM&#x2019;s ability to process native clinical text. 
Although most open-source multilingual LLMs perform best in English, Chinese, and Spanish [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>], the LLM in this study achieved strong results in Dutch, suggesting that carefully designed prompts can support effective use in underrepresented languages. Despite the single-center scope, the dataset spans 7 years and includes reports from many ED physicians with varying documentation styles, supporting robustness, but external validation across linguistic and clinical contexts is needed [<xref ref-type="bibr" rid="ref50">50</xref>]. Finally, this study focused on extracting signs and symptoms, not numeric values such as laboratory results or vital signs, which are typically stored in structured EHR fields.</p><p>This study demonstrates that a locally deployable LLM can automatically extract clinical signs and symptoms from free-text Dutch ED reports with performance comparable to manual labeling by two ED physicians using a structured prompt framework, even in a less widely represented language. This approach has the potential to support scalable implementation of our ML prediction models across clinical contexts, without extensive human annotation or computational resources. Ultimately, it may inform future development of real-time integration of prediction models into clinical workflows, facilitating more scalable, transparent, and privacy-preserving decision-support systems.</p></sec></body><back><ack><p>ChatGPT 4o (OpenAI) was used to improve grammar and spelling. We would like to express our gratitude to Harm Geraerdts, MSc, for his contribution to the publication of this paper. 
We thank Kim Jie, PhD, MD, for contributions to the conceptual framework.</p></ack><notes><sec><title>Funding</title><p>The authors received financial support from Radboudumc, Jeroen Bosch Hospital and Health~Holland (LSHM20103) for this research, authorship, and publication of this paper. The funders had no involvement in this study&#x2019;s design, data collection, analysis, interpretation, or the writing of this paper.</p></sec><sec><title>Data Availability</title><p>Information and the raw data are available from the corresponding author upon reasonable request. The underlying code for the large language model data extraction tool, the HIVE (History, Intake, Vitals, Examination) model, as well as the saved HIVE model, is available in a figshare repository (10.6084/m9.figshare.28931030).</p></sec></notes><fn-group><fn fn-type="con"><p>All authors have accepted responsibility for the entire content of this paper and approved its submission. AS, PB, MR, and BvG initiated and designed this study. AS collected the patient data. AS and PB created the reference standard and designed the features; KJ (Kim Jie, MD, PhD) and RDoC reviewed these features for annotation in a previous study. LvdW and RDoC manually annotated the emergency department reports in the reader study. JSB contributed to the analysis of large language model&#x2013;based feature extraction. AS and PB developed and finetuned the prompts. AS analyzed the data and developed the History, Intake, Vitals, Examination prediction models for appendicitis prediction. All authors provided input and improved this paper. SK, RK, MR, and BvG supervised this project. 
AS wrote the paper with input from all authors.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AAP</term><def><p>acute abdominal pain</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">BERT</term><def><p>Bidirectional Encoder Representations From Transformers</p></def></def-item><def-item><term id="abb5">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb6">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb7">HIVE</term><def><p>History, Intake, Vitals, Examination</p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb11">TRIPOD</term><def><p>Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis</p></def></def-item><def-item><term id="abb12">XGBoost</term><def><p>Extreme Gradient Boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kachman</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Brennan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Oskvarek</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Waseem</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pines</surname><given-names>JM</given-names> 
</name></person-group><article-title>How artificial intelligence could transform emergency care</article-title><source>Am J Emerg Med</source><year>2024</year><month>07</month><volume>81</volume><fpage>40</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.04.024</pub-id><pub-id pub-id-type="medline">38663302</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Toti</surname><given-names>G</given-names> </name><name name-style="western"><surname>Morley</surname><given-names>KI</given-names> </name><etal/></person-group><article-title>SemEHR: a general-purpose semantic search system to surface semantic data from clinical notes for tailored care, trial recruitment, and clinical research</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>05</month><day>1</day><volume>25</volume><issue>5</issue><fpage>530</fpage><lpage>537</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx160</pub-id><pub-id pub-id-type="medline">29361077</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ben-Haim</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yosef</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rowand</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Combination of machine learning algorithms with natural language processing may increase the probability of bacteremia detection in the emergency department: a retrospective, big-data analysis of 94,482 patients</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076241277673</fpage><pub-id 
pub-id-type="doi">10.1177/20552076241277673</pub-id><pub-id pub-id-type="medline">39291149</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohanty</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Lekan</surname><given-names>D</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>TP</given-names> </name><name name-style="western"><surname>Jenkins</surname><given-names>M</given-names> </name><name name-style="western"><surname>Manda</surname><given-names>P</given-names> </name></person-group><article-title>Machine learning for predicting readmission risk among the frail: explainable AI for healthcare</article-title><source>Patterns (N Y)</source><year>2022</year><month>01</month><day>14</day><volume>3</volume><issue>1</issue><fpage>100395</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2021.100395</pub-id><pub-id pub-id-type="medline">35079714</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>SH</given-names> </name></person-group><article-title>Natural language generation for electronic health records</article-title><source>NPJ Digit Med</source><year>2018</year><volume>1</volume><fpage>63</fpage><pub-id pub-id-type="doi">10.1038/s41746-018-0070-0</pub-id><pub-id pub-id-type="medline">30687797</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sonoo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Iwai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Inokuchi</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Gunshin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kitsuta</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakajima</surname><given-names>S</given-names> </name></person-group><article-title>Embedded-structure template for electronic records affects patient note quality and management for emergency head injury patients: an observational pre and post comparison quality improvement study</article-title><source>Medicine (Baltimore)</source><year>2016</year><month>10</month><volume>95</volume><issue>40</issue><fpage>e5105</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000005105</pub-id><pub-id pub-id-type="medline">27749590</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moy</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Hobensack</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Understanding the perceived role of electronic health records and workflow fragmentation on clinician documentation burden in emergency departments</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>04</month><day>19</day><volume>30</volume><issue>5</issue><fpage>797</fpage><lpage>808</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad038</pub-id><pub-id pub-id-type="medline">36905604</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> 
</name><etal/></person-group><article-title>Deep learning-based natural language processing for detecting medical symptoms and histories in emergency patient triage</article-title><source>Am J Emerg Med</source><year>2024</year><month>03</month><volume>77</volume><fpage>29</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2023.11.063</pub-id><pub-id pub-id-type="medline">38096637</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koleck</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Dreisbach</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bourne</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name></person-group><article-title>Natural language processing of symptoms documented in free-text narratives of electronic health records: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2019</year><month>04</month><day>1</day><volume>26</volume><issue>4</issue><fpage>364</fpage><lpage>379</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy173</pub-id><pub-id pub-id-type="medline">30726935</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on  May 24, 
2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Rong</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A critical assessment of using ChatGPT for extracting structured data from clinical notes</article-title><source>NPJ Digit Med</source><year>2024</year><month>05</month><day>1</day><volume>7</volume><issue>1</issue><fpage>106</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01079-8</pub-id><pub-id pub-id-type="medline">38693429</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bigolin Lanfredi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mukherjee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Summers</surname><given-names>RM</given-names> </name></person-group><article-title>Enhancing chest X-ray datasets with privacy-preserving large language models and multi-type annotations: a data-driven approach for improved classification</article-title><source>Med Image Anal</source><year>2025</year><month>01</month><volume>99</volume><fpage>103383</fpage><pub-id pub-id-type="doi">10.1016/j.media.2024.103383</pub-id><pub-id pub-id-type="medline">39546982</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dorfner</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>J&#x00FC;rgensen</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Donle</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Comparing commercial and open-source large language models for labeling chest radiograph reports</article-title><source>Radiology</source><year>2024</year><month>10</month><volume>313</volume><issue>1</issue><fpage>e241139</fpage><pub-id pub-id-type="doi">10.1148/radiol.241139</pub-id><pub-id pub-id-type="medline">39470431</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMurry</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Phelan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dixon</surname><given-names>BE</given-names> </name><etal/></person-group><article-title>Large language model symptom identification from clinical text: multicenter study</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>31</day><volume>27</volume><fpage>e72984</fpage><pub-id pub-id-type="doi">10.2196/72984</pub-id><pub-id pub-id-type="medline">40743494</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bejan</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Reed</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Mikula</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models improve the identification of emergency department visits for symptomatic kidney stones</article-title><source>Sci Rep</source><year>2025</year><month>01</month><day>28</day><volume>15</volume><issue>1</issue><fpage>3503</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-86632-5</pub-id><pub-id 
pub-id-type="medline">39875475</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A large language model based pipeline for extracting information from patient complaint and anamnesis in clinical notes for severity assessment</article-title><source>Sci Rep</source><year>2025</year><volume>15</volume><issue>1</issue><fpage>25345</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-07649-4</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hasan</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Tarannum</surname><given-names>P</given-names> </name><name name-style="western"><surname>Dey</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Do large language models speak all languages equally? 
a comparative study in low-resource settings</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.02237</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schipper</surname><given-names>A</given-names> </name><name name-style="western"><surname>Belgers</surname><given-names>P</given-names> </name><name name-style="western"><surname>O&#x2019;Connor</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Machine-learning based prediction of appendicitis for patients presenting with acute abdominal pain at the emergency department</article-title><source>World J Emerg Surg</source><year>2024</year><month>12</month><day>23</day><volume>19</volume><issue>1</issue><fpage>40</fpage><pub-id pub-id-type="doi">10.1186/s13017-024-00570-7</pub-id><pub-id pub-id-type="medline">39716296</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bosma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>VY</given-names> </name><etal/></person-group><article-title>Finetuned language models are zero-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 8, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.01652</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Mistral NeMo</article-title><source>Mistral AI</source><year>2024</year><month>07</month><day>18</day><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://mistral.ai/news/mistral-nemo">https://mistral.ai/news/mistral-nemo</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Qwen</collab></person-group><article-title>Qwen2.5 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 3, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.15115</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 4, 2026</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemma Team</collab></person-group><article-title>Gemma 2: improving open language models at a practical size</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.00118</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Blackwell</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Barry</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>AG</given-names> </name></person-group><article-title>Towards reproducible LLM evaluation: quantifying uncertainty in LLM benchmark scores</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 27, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.03492</pub-id></nlm-citation></ref><ref 
id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>LLM-data-extractor-from-reports-ollama-</article-title><source>GitHub</source><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/aschipper/LLM-data-extractor-from-reports-ollama-">https://github.com/aschipper/LLM-data-extractor-from-reports-ollama-</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><article-title>Llm_extractinator-v0.4.2</article-title><source>Zenodo</source><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/15089764">https://zenodo.org/records/15089764</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>XGBoost documentation</article-title><source>DMLC XGBoost</source><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://xgboost.readthedocs.io">https://xgboost.readthedocs.io</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prokhorenkova</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gusev</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vorobev</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dorogush</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Gulin</surname><given-names>A</given-names> </name></person-group><article-title>CatBoost: unbiased boosting with categorical features</article-title><source>Proc 32nd Conf Neural Inf Process Syst NeurIPS 2018</source><year>2018</year><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://proceedings.neurips.cc/paper_files/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><source>CatBoost Encoder</source><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://contrib.scikit-learn.org/category_encoders/catboost.html">https://contrib.scikit-learn.org/category_encoders/catboost.html</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Akiba</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sano</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yanase</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ohta</surname><given-names>T</given-names> </name><name name-style="western"><surname>Koyama</surname><given-names>M</given-names> </name></person-group><article-title>Optuna: a next-generation hyperparameter optimization framework (version 3.6.1)</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 25, 2019</comment><pub-id pub-id-type="doi">10.1145/3292500.3330701</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>IQVIA Netherlands</article-title><comment><ext-link ext-link-type="uri" xlink:href="https://www.iqvia.com/nl-nl/locations/netherlands">https://www.iqvia.com/nl-nl/locations/netherlands</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>MY</given-names> </name><name 
name-style="western"><surname>Gustad</surname><given-names>LT</given-names> </name><name name-style="western"><surname>Nytr&#x00F8;</surname><given-names>&#x00D8;</given-names> </name></person-group><article-title>Sepsis prediction, early detection, and identification using clinical text for machine learning: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>01</month><day>29</day><volume>29</volume><issue>3</issue><fpage>559</fpage><lpage>575</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab236</pub-id><pub-id pub-id-type="medline">34897469</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yeow</surname><given-names>AYK</given-names> </name><etal/></person-group><article-title>Artificial intelligence in sepsis early prediction and diagnosis using unstructured data in healthcare</article-title><source>Nat Commun</source><year>2021</year><month>01</month><day>29</day><volume>12</volume><issue>1</issue><fpage>711</fpage><pub-id pub-id-type="doi">10.1038/s41467-021-20910-4</pub-id><pub-id pub-id-type="medline">33514699</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoogendoorn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>LMG</given-names> </name><name name-style="western"><surname>Numans</surname><given-names>ME</given-names> </name></person-group><article-title>Utilizing uncoded consultation notes from electronic medical records for predictive 
modeling of colorectal cancer</article-title><source>Artif Intell Med</source><year>2016</year><month>05</month><volume>69</volume><fpage>53</fpage><lpage>61</lpage><pub-id pub-id-type="doi">10.1016/j.artmed.2016.03.003</pub-id><pub-id pub-id-type="medline">27085847</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ryoo</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A pediatric emergency prediction model using natural language process in the pediatric emergency department</article-title><source>Sci Rep</source><year>2025</year><month>01</month><day>28</day><volume>15</volume><issue>1</issue><fpage>3574</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-87161-x</pub-id><pub-id pub-id-type="medline">39875462</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nunez</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bates</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>RT</given-names> </name></person-group><article-title>Predicting the survival of patients with cancer from their initial oncology consultation document using natural language processing</article-title><source>JAMA Netw Open</source><year>2023</year><month>02</month><day>1</day><volume>6</volume><issue>2</issue><fpage>e230813</fpage><pub-id 
pub-id-type="doi">10.1001/jamanetworkopen.2023.0813</pub-id><pub-id pub-id-type="medline">36848085</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shankar</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bundele</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mukhopadhyay</surname><given-names>A</given-names> </name></person-group><article-title>Natural language processing of electronic health records for early detection of cognitive decline: a systematic review</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>1</day><volume>8</volume><issue>1</issue><fpage>133</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01527-z</pub-id><pub-id pub-id-type="medline">40025194</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pacella</surname><given-names>CB</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name></person-group><article-title>Integrating structured and unstructured data for predicting emergency severity: an association and predictive study using transformer-based natural language processing models</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>12</month><day>5</day><volume>24</volume><issue>1</issue><fpage>372</fpage><pub-id pub-id-type="doi">10.1186/s12911-024-02793-9</pub-id><pub-id pub-id-type="medline">39633370</pub-id></nlm-citation></ref><ref 
id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Boonyarat</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>YC</given-names> </name></person-group><article-title>Clinical narrative-aware deep neural network for emergency department critical outcome prediction</article-title><source>J Biomed Inform</source><year>2023</year><month>02</month><volume>138</volume><fpage>104284</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104284</pub-id><pub-id pub-id-type="medline">36632861</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iscoe</surname><given-names>M</given-names> </name><name name-style="western"><surname>Socrates</surname><given-names>V</given-names> </name><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Identifying signs and symptoms of urinary tract infection from emergency department clinical notes using large language models</article-title><source>Acad Emerg Med</source><year>2024</year><month>06</month><volume>31</volume><issue>6</issue><fpage>599</fpage><lpage>610</lpage><pub-id pub-id-type="doi">10.1111/acem.14883</pub-id><pub-id pub-id-type="medline">38567658</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on clinical natural language processing in the United Kingdom from 2007 to 2022</article-title><source>NPJ Digit Med</source><year>2022</year><month>12</month><day>21</day><volume>5</volume><issue>1</issue><fpage>186</fpage><pub-id pub-id-type="doi">10.1038/s41746-022-00730-6</pub-id><pub-id pub-id-type="medline">36544046</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Japkowicz</surname><given-names>N</given-names> </name><name name-style="western"><surname>Stephen</surname><given-names>S</given-names> </name></person-group><article-title>The class imbalance problem: a systematic study</article-title><source>Intell Data Anal</source><year>2002</year><volume>6</volume><issue>5</issue><fpage>429</fpage><lpage>449</lpage><pub-id pub-id-type="doi">10.3233/IDA-2002-6504</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Masayoshi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hashimoto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yokoyama</surname><given-names>R</given-names> </name><name name-style="western"><surname>Toda</surname><given-names>N</given-names> </name><name name-style="western"><surname>Uwamino</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fukuda</surname><given-names>S</given-names> </name><etal/></person-group><article-title>EHR-MCP: real-world evaluation of clinical information retrieval by large language models via model context protocol</article-title><source>arXiv</source><comment>Preprint posted 
online on  Sep 19, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2509.15957</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahajan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Basu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pai</surname><given-names>CW</given-names> </name><etal/></person-group><article-title>Factors associated with potentially missed diagnosis of appendicitis in the emergency department</article-title><source>JAMA Netw Open</source><year>2020</year><month>03</month><day>2</day><volume>3</volume><issue>3</issue><fpage>e200612</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.0612</pub-id><pub-id pub-id-type="medline">32150270</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drake</surname><given-names>FT</given-names> </name><name name-style="western"><surname>Flum</surname><given-names>DR</given-names> </name></person-group><article-title>Improvement in the diagnosis of appendicitis</article-title><source>Adv Surg</source><year>2013</year><volume>47</volume><fpage>299</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1016/j.yasu.2013.03.003</pub-id><pub-id pub-id-type="medline">24298858</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gelpke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hamminga</surname><given-names>JTH</given-names> </name><name name-style="western"><surname>van Bastelaar</surname><given-names>JJ</given-names> </name><etal/></person-group><article-title>Reducing the negative appendectomy rate with the laparoscopic 
appendicitis score; a multicenter prospective cohort and validation study</article-title><source>Int J Surg</source><year>2020</year><month>07</month><volume>79</volume><fpage>257</fpage><lpage>264</lpage><pub-id pub-id-type="doi">10.1016/j.ijsu.2020.04.041</pub-id><pub-id pub-id-type="medline">32387211</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosenbloom</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Denny</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lorenzi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Stead</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>KB</given-names> </name></person-group><article-title>Data from clinical notes: a perspective on the tension between structure and flexible documentation</article-title><source>J Am Med Inform Assoc</source><year>2011</year><volume>18</volume><issue>2</issue><fpage>181</fpage><lpage>186</lpage><pub-id pub-id-type="doi">10.1136/jamia.2010.007237</pub-id><pub-id pub-id-type="medline">21233086</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nemkova</surname><given-names>P</given-names> </name><name name-style="western"><surname>Adhikari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sadu</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Albert</surname><given-names>MV</given-names> </name></person-group><article-title>Cross-lingual stability and bias in 
instruction-tuned language models for humanitarian NLP</article-title><source>arXiv</source><comment>Preprint posted online on Oct 26, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2510.22823</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Vanroy</surname><given-names>B</given-names> </name></person-group><article-title>Language resources for Dutch large language modelling</article-title><source>arXiv</source><comment>Preprint posted online on Dec 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.12852</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Intrator</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Halfon</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goldenberg</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Breaking the language barrier: can direct inference outperform pre-translation in multilingual LLM applications?</article-title><source>Proc 2024 Conf North Am Chapter Assoc Comput Linguist</source><year>2024</year><fpage>829</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-short.75</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Development of structured features from free-text ED (emergency department) reports using researcher annotations.</p><media xlink:href="medinform_v14i1e81500_app1.docx" xlink:title="DOCX File, 34 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Feature contributions to the HIVE (History, Intake, Vitals, Examination) model from the previous study&#x2019;s validation set (n=68).</p><media 
xlink:href="medinform_v14i1e81500_app2.docx" xlink:title="DOCX File, 44 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Elements evaluated in the prompt ablation study for each feature.</p><media xlink:href="medinform_v14i1e81500_app3.xlsx" xlink:title="XLSX File, 20 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>All outputs from the LLM (large language model) and ED (emergency department) physicians&#x2019; annotations (n=236).</p><media xlink:href="medinform_v14i1e81500_app4.xlsx" xlink:title="XLSX File, 109 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Full list of feature contributions using SHAP (Shapley Additive Explanations) values scaled as percentage contributions to the HIVE (History, Intake, Vitals, Examination) model on the validation set (n=68).</p><media xlink:href="medinform_v14i1e81500_app5.docx" xlink:title="DOCX File, 39 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>AUROC (area under the receiver operating characteristic curve) of the HIVE (History, Intake, Vitals, Examination) random forest model for predicting appendicitis vs other AAP (acute abdominal pain) causes in the validation set (n=68).</p><media xlink:href="medinform_v14i1e81500_app6.docx" xlink:title="DOCX File, 120 KB"/></supplementary-material><supplementary-material id="app7"><label>Checklist 1</label><p>TRIPOD Checklist.</p><media xlink:href="medinform_v14i1e81500_app7.docx" xlink:title="DOCX File, 67 KB"/></supplementary-material></app-group></back></article>