<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e65454</article-id><article-id pub-id-type="doi">10.2196/65454</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Classifying Unstructured Text in Electronic Health Records for Mental Health Prediction Models: Large Language Model Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Cardamone</surname><given-names>Nicholas C</given-names></name><degrees>MSEd</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Olfson</surname><given-names>Mark</given-names></name><degrees>MD, MPH</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schmutte</surname><given-names>Timothy</given-names></name><degrees>PsyD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ungar</surname><given-names>Lyle</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Tony</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cullen</surname><given-names>Sara W</given-names></name><degrees>MSW, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Williams</surname><given-names>Nathaniel J</given-names></name><degrees>PhD, LCSW</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Marcus</surname><given-names>Steven C</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Psychiatry, Perelman School of Medicine, University of Pennsylvania</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Psychiatry, the New York State Psychiatric Institute</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Psychiatry, Yale School of Medicine</institution><addr-line>New Haven</addr-line><addr-line>CT</addr-line><country>United States</country></aff><aff id="aff4"><institution>Computer and Information Science, University of Pennsylvania</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff5"><institution>School of Social Policy &#x0026; Practice, University of Pennsylvania</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff 
id="aff6"><institution>School of Social Work, Boise State University</institution><addr-line>Boise</addr-line><addr-line>ID</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chung</surname><given-names>Philip</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Nicholas C Cardamone, MSEd, Department of Psychiatry, Perelman School of Medicine, University of Pennsylvania, 3535 Market Street, Philadelphia, PA, 19104, United States, 1 2158800568; <email>nicholas.cardamone@va.gov</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>21</day><month>1</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e65454</elocation-id><history><date date-type="received"><day>15</day><month>08</month><year>2024</year></date><date date-type="rev-recd"><day>25</day><month>11</month><year>2024</year></date><date date-type="accepted"><day>30</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Nicholas C Cardamone, Mark Olfson, Timothy Schmutte, Lyle Ungar, Tony Liu, Sara W Cullen, Nathaniel J Williams, Steven C Marcus. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 21.1.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e65454"/><abstract><sec><title>Background</title><p>Prediction models have demonstrated a range of applications across medicine, including using electronic health record (EHR) data to identify hospital readmission and mortality risk. 
Large language models (LLMs) can transform unstructured EHR text into structured features, which can then be integrated into statistical prediction models, ensuring that the results are both clinically meaningful and interpretable.</p></sec><sec><title>Objective</title><p>This study aims to compare the classification decisions made by clinical experts with those generated by a state-of-the-art LLM, using terms extracted from a large EHR data set of individuals with mental health disorders seen in emergency departments (EDs).</p></sec><sec sec-type="methods"><title>Methods</title><p>Using a dataset from the EHR systems of more than 50 health care provider organizations in the United States from 2016 to 2021, we extracted all clinical terms that appeared in at least 1000 records of individuals admitted to the ED for a mental health&#x2013;related problem from a source population of over 6 million ED episodes. Two experienced mental health clinicians (one medically trained psychiatrist and one clinical psychologist) reached consensus on the classification of EHR terms and diagnostic codes into categories. We evaluated an LLM&#x2019;s agreement with clinical judgment across three classification tasks as follows: (1) classify terms into &#x201C;mental health&#x201D; or &#x201C;physical health&#x201D;, (2) classify mental health terms into 1 of 42 prespecified categories, and (3) classify physical health terms into 1 of 19 prespecified broad categories.</p></sec><sec sec-type="results"><title>Results</title><p>There was high agreement between the LLM and clinical experts when categorizing 4553 terms as &#x201C;mental health&#x201D; or &#x201C;physical health&#x201D; (&#x03BA;=0.77, 95% CI 0.75-0.80). 
However, there was still considerable variability in LLM-clinician agreement on the classification of mental health terms (&#x03BA;=0.62, 95% CI 0.59&#x2010;0.66) and physical health terms (&#x03BA;=0.69, 95% CI 0.67&#x2010;0.70).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The LLM displayed high agreement with clinical experts when classifying EHR terms into certain mental health or physical health term categories. However, agreement with clinical experts varied considerably within both sets of mental and physical health term categories. Importantly, the use of LLMs presents an alternative to manual human coding, presenting great potential to create interpretable features for prediction models.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>machine learning</kwd><kwd>ML</kwd><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>ChatGPT</kwd><kwd>predictive modeling</kwd><kwd>mental health</kwd><kwd>health informatics</kwd><kwd>electronic health record</kwd><kwd>EHR</kwd><kwd>EHR system</kwd><kwd>text</kwd><kwd>dataset</kwd><kwd>mental health disorder</kwd><kwd>emergency department</kwd><kwd>physical health</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Recent advances in health informatics have led to the development of machine learning models that are trained on data from electronic health records (EHRs). These models have proven to be effective across a range of health domains, including predicting the spread of disease [<xref ref-type="bibr" rid="ref1">1</xref>], hospital readmission rates [<xref ref-type="bibr" rid="ref2">2</xref>], and suicide risk [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. 
Predictive models have been implemented in EHR systems to identify high-risk patients and alert clinicians to critical health events [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>EHR systems are filled with unstructured text data, including clinical notes and discharge summaries, which are not easily categorized into clinically interpretable groupings for use in predictive models. Although the use of this data can greatly enhance prediction model performance and the interpretability of decision-support tools [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], the coding process is labor intensive and requires expert consultation and extensive training [<xref ref-type="bibr" rid="ref8">8</xref>]. These challenges hinder the development and scalability of clinical prediction models that incorporate unstructured EHR data [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Large language models (LLMs), like OpenAI&#x2019;s GPT models, can streamline the classification and coding of unstructured EHR text due to their massive training data sets and advanced text processing [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. LLMs have been used to categorize unstructured text from EHR systems [<xref ref-type="bibr" rid="ref13">13</xref>], assist with qualitative analysis [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], and perform deductive coding with and without context [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Preliminary evidence shows that LLMs outperform crowd workers in annotation of health texts [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>The reliability of LLMs in replicating clinical judgment for coding classification tasks in mental health remains uncertain, particularly given the inherent complexities of mental health disorders [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Prior research highlights that while LLMs can process large volumes of text, their ability to discern subtle differences in clinical presentations, such as differentiating between comorbid conditions like depression and anxiety, is still unproven. This challenge is exacerbated by the frequent overlap of symptoms across diagnoses, which complicates classification efforts [<xref ref-type="bibr" rid="ref21">21</xref>]. Patients with mental health disorders may present with unique clinical characteristics that challenge an LLM&#x2019;s ability to accurately identify and code physical and mental health symptoms [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>We used a large EHR data set of individuals admitted to the emergency department (ED) for a mental health disorder to assess the ability of a state-of-the-art LLM to classify EHR terms into categories defined by experienced mental health clinicians. We assessed the extent to which an LLM replicates clinical judgment and the practicality of using an LLM to assist in creating clinically interpretable features for prediction models.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data</title><p>We extracted de-identified EHR data from the Optum Labs Data Warehouse, a longitudinal, real-world data asset, from &#x003E;50 US health care provider organizations that encompass &#x003E;700 hospitals. 
We included individuals aged &#x2265;10 years who were admitted to the ED from 2016 to 2021 and had an International Classification of Diseases-9 or -10 code for a mental health diagnosis, suicidal ideation, or self-harm, resulting in approximately 6.2 million unique patient episodes. A natural language processing (NLP) algorithm integrated into the Optum Labs Data Warehouse extracted clinical terms for signs, symptoms, and diseases from unstructured free-text fields in the EHR, based on the National Library of Medicine&#x2019;s Unified Medical Language System dictionary. We identified physical and mental health terms that appeared in at least 1000 unique patient episodes.</p></sec><sec id="s2-2"><title>Coding</title><p>A board-certified psychiatrist and a licensed clinical psychologist categorized each EHR term into 1 of 61 categories, including 42 mental health-related categories and 19 physical health-related categories, which were generated from the Clinical Classifications Software Refined [<xref ref-type="bibr" rid="ref24">24</xref>] and the International Classification of Diseases-10 diagnosis coding system, respectively. Coding each EHR term involved: (1) initial classification by 1 clinician coder, (2) a review of all coding decisions by a second clinician coder with suggestions for revisions, and (3) a final consensus reconciliation involving both coders. The coding of physical health terms was supported by an LLM, which suggested coding decisions that were refined and reconciled (5% of terms required reconciliation) by the 2 clinician coders. All study procedures were approved by the Institutional Review Board of the University of Pennsylvania.</p></sec><sec id="s2-3"><title>Classification Tasks</title><p>We used the Python module &#x201C;openai&#x201D; [<xref ref-type="bibr" rid="ref25">25</xref>] to run the GPT-4 LLM in a Python environment. 
We used OpenAI&#x2019;s most sophisticated GPT-4 that was then publicly available (&#x201C;gpt-4-turbo-2024-04-09&#x201D;) and set model parameters to maximize output consistency (eg, temperature=0).</p><p>We prompted the model with 3 &#x201C;zero-shot&#x201D; classification tasks, wherein the model is provided codes without examples: (1) classify all (n=4553) EHR terms as either &#x201C;mental health&#x201D; or &#x201C;physical health,&#x201D; (2) classify each of the (n=846) mental health terms into 1 of the 42 mental health categories, and (3) classify each of the (n=3707) physical health terms into 1 of the 19 physical health categories. The prompt described the task, listed the possible categories, and provided the EHR terms. The model then confirmed that the predicted category was among the list of possible categories. For full reproducibility, the complete prompt provided to the model, including the task description and category list, is detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In task 2, the model was given an unstructured clinical term from an EHR such as &#x201C;depressive symptoms.&#x201D; Then, the prompt described the classification task and provided the following list of possible mental health categories (eg, &#x201C;depression,&#x201D; &#x201C;anxiety,&#x201D; &#x201C;eating disorder symptoms,&#x201D; and &#x201C;substance use&#x201D;). The process was repeated for all 846 mental health terms, and similarly for the 3707 physical health terms in task 3.</p></sec><sec id="s2-4"><title>Performance Metrics</title><p>We compared GPT-4&#x2019;s predicted categories with the categories determined by clinical judgment using the Python library scikit-learn &#x201C;metrics&#x201D; module [<xref ref-type="bibr" rid="ref26">26</xref>]. For each task, we report the overall Cohen &#x03BA; and weighted average of precision, recall, and <italic>F</italic><sub>1</sub>-score, accounting for label imbalance. 
We computed 95% CIs for Cohen &#x03BA;, precision, recall, and <italic>F</italic><sub>1</sub>-score using a bootstrap procedure with 1000 resamples [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>Ethical approval (IRB Protocol #848806) for this study was waived by the University of Pennsylvania Institutional Review Board via 45 CFR 46.104, category 4.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>EHR terms (n=4553) were categorized by GPT as &#x201C;mental health&#x201D; or &#x201C;physical health.&#x201D; Overall, classification performance was strong with &#x03BA; of 0.77 (95% CI 0.75-0.80), precision of 0.93 (95% CI 0.92&#x2010;0.94), recall of 0.93 (95% CI 0.92&#x2010;0.94), and <italic>F</italic><sub>1</sub>-score of 0.93 (95% CI 0.92&#x2010;0.94). The GPT-4 classified 18.3% (n=833) of the EHR terms as &#x201C;mental health&#x201D; and 81.7% (n=3720) as &#x201C;physical health&#x201D; (<xref ref-type="table" rid="table1">Table 1</xref>). 
The clinician coders and model disagreed on the categorization of 164 (19.7%) mental health terms (eg, &#x201C;gunshot wound,&#x201D; &#x201C;chronic fatigue syndrome,&#x201D; and &#x201C;IV drug use&#x201D;) and 149 (4%) physical health terms (eg, &#x201C;activity issues,&#x201D; &#x201C;lethargic,&#x201D; and &#x201C;food issues&#x201D;).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Recall, <italic>F</italic><sub>1</sub>-score, and total mentions among terms in the data set across health domains.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Health domain (n)</td><td align="left" valign="bottom">Recall (95% CI)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="bottom">Total mentions in data set (thousands)</td></tr></thead><tbody><tr><td align="left" valign="top">Physical health (n=3707)</td><td align="left" valign="top">0.96 (0.95&#x2010;0.97)</td><td align="left" valign="top">0.96 (0.95&#x2010;0.96)</td><td align="left" valign="top">255,573</td></tr><tr><td align="left" valign="top">Mental health (n=846)</td><td align="left" valign="top">0.81 (0.78&#x2010;0.83)</td><td align="left" valign="top">0.81 (0.79&#x2010;0.83)</td><td align="left" valign="top">85,081</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Recall indicates the proportion of terms in a clinician-coded category that were classified by the model as belonging to that category.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Mental Health</title><p>Mental health terms (n=846) were classified into 42 categories with &#x03BA; of 0.62 (95% CI 0.59-0.66), precision of 0.71 (95% CI 0.68&#x2010;0.74), recall of 0.64 (95% CI 0.61&#x2010;0.68), and <italic>F</italic><sub>1</sub>-score of 0.65 (95% CI 0.62&#x2010;0.69). 
<xref ref-type="table" rid="table2">Table 2</xref> includes category-wise recall, <italic>F</italic><sub>1</sub>-score, and a set of the most frequent categories into which terms from the true category were misclassified (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Mental health term categories: recall, <italic>F</italic><sub>1</sub>-score, total mentions in the dataset, and most common misclassification (in descending order of recall). Categories with &#x003C;5 terms were excluded.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Term category (n)</td><td align="left" valign="bottom">Recall (95% CI)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="bottom">Total mentions in dataset (thousands)</td><td align="left" valign="bottom">Misclassifications (n)</td></tr></thead><tbody><tr><td align="left" valign="top">Eating disorder or symptoms (n=16)</td><td align="left" valign="top">1 (0.81&#x2010;1)</td><td align="left" valign="top">0.91 (0.80&#x2010;1)</td><td align="left" valign="top">582</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">Living situation (n=11)</td><td align="left" valign="top">1 (0.74&#x2010;1)</td><td align="left" valign="top">1 (1&#x2010;1)</td><td align="left" valign="top">1259</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">ADHD<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> spectrum (n=11)</td><td align="left" valign="top">1 (0.74&#x2010;1)</td><td align="left" valign="top">0.73 (0.52-0.88)</td><td align="left" valign="top">810</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">OCD<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> 
symptoms or disorder (n=10)</td><td align="left" valign="top">1 (0.72&#x2010;1)</td><td align="left" valign="top">0.87 (0.67&#x2010;1)</td><td align="left" valign="top">207</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">Somatization symptoms (n=6)</td><td align="left" valign="top">1 (0.61&#x2010;1)</td><td align="left" valign="top">0.86 (0.57&#x2010;1)</td><td align="left" valign="top">62</td><td align="left" valign="top">None</td></tr><tr><td align="left" valign="top">Neurocognitive disorders (n=20)</td><td align="left" valign="top">0.95 (0.76&#x2010;0.99)</td><td align="left" valign="top">0.62 (0.47&#x2010;0.75)</td><td align="left" valign="top">1225</td><td align="left" valign="top">Neurocognitive symptoms (n=1)</td></tr><tr><td align="left" valign="top">Sleep wake symptoms or disorder (n=37)</td><td align="left" valign="top">0.95 (0.82&#x2010;0.99)</td><td align="left" valign="top">0.86 (0.78&#x2010;0.94)</td><td align="left" valign="top">1833</td><td align="left" valign="top">Miscellaneous psychiatric symptoms (n=1) and depressive symptoms (n=1)</td></tr><tr><td align="left" valign="top">Substance-related symptoms or disorder (n=90)</td><td align="left" valign="top">0.92 (0.85&#x2010;0.96)</td><td align="left" valign="top">0.95 (0.91&#x2010;0.98)</td><td align="left" valign="top">8783</td><td align="left" valign="top">Neurocognitive disorders (n=4), neurocognitive symptoms (n=1), and psychotic symptoms or disorder (n=1)</td></tr><tr><td align="left" valign="top">Abusive behavior (n=26)</td><td align="left" valign="top">0.89 (0.71&#x2010;0.96)</td><td align="left" valign="top">0.84 (0.71&#x2010;0.93)</td><td align="left" valign="top">3053</td><td align="left" valign="top">Aggressive symptoms (n=1), miscellaneous psychiatric symptoms (n=1), and personality disorder (n=1)</td></tr><tr><td align="left" valign="top">Unipolar depressive disorder (n=8)</td><td align="left" valign="top">0.88 (0.53&#x2010;0.98)</td><td align="left" 
valign="top">0.78 (0.50&#x2010;0.96)</td><td align="left" valign="top">944</td><td align="left" valign="top">Mood disorder (n=1)</td></tr><tr><td align="left" valign="top">Autism spectrum disorder (n=7)</td><td align="left" valign="top">0.86 (0.49&#x2010;0.97)</td><td align="left" valign="top">0.71 (0.38&#x2010;0.92)</td><td align="left" valign="top">132</td><td align="left" valign="top">Mood disorder (n=1)</td></tr><tr><td align="left" valign="top">Impulsive behavior (n=6)</td><td align="left" valign="top">0.83 (0.44&#x2010;0.97)</td><td align="left" valign="top">0.83 (0.50&#x2010;1)</td><td align="left" valign="top">414</td><td align="left" valign="top">Aggressive symptoms (n=1)</td></tr><tr><td align="left" valign="top">Personality disorder (n=5)</td><td align="left" valign="top">0.80 (0.38&#x2010;0.96)</td><td align="left" valign="top">0.47 (0.11&#x2010;0.73)</td><td align="left" valign="top">158</td><td align="left" valign="top">OCD symptoms or disorder (n=1)</td></tr><tr><td align="left" valign="top">Injury (n=76)</td><td align="left" valign="top">0.78 (0.67&#x2010;0.88)</td><td align="left" valign="top">0.84 (0.77&#x2010;0.90)</td><td align="left" valign="top">10,470</td><td align="left" valign="top">Self harm (n=8), miscellaneous psychiatric symptoms (n=3), and stress-related symptoms or disorder (n=2)</td></tr><tr><td align="left" valign="top">Psychotic symptoms or disorder (n=50)</td><td align="left" valign="top">0.76 (0.63&#x2010;0.86)</td><td align="left" valign="top">0.76 (0.66&#x2010;0.85)</td><td align="left" valign="top">6074</td><td align="left" valign="top">Miscellaneous psychiatric symptoms (n=5), neurocognitive symptoms (n=2), and impulsive behavior (n=1)</td></tr><tr><td align="left" valign="top">Stress-related symptoms or disorder (n=11)</td><td align="left" valign="top">0.73 (0.43&#x2010;0.90)</td><td align="left" valign="top">0.57 (0.32&#x2010;0.77)</td><td align="left" valign="top">480</td><td align="left" valign="top">Stressor symptoms 
(n=2) and anxiety symptoms (n=1)</td></tr><tr><td align="left" valign="top">Anxiety disorder (n=14)</td><td align="left" valign="top">0.71 (0.45&#x2010;0.88)</td><td align="left" valign="top">0.71 (0.50&#x2010;0.90)</td><td align="left" valign="top">683</td><td align="left" valign="top">Anxiety symptoms (n=1), social situation (n=1), and somatization symptoms (n=1)</td></tr><tr><td align="left" valign="top">Suicidal symptoms (n=12)</td><td align="left" valign="top">0.67 (0.39&#x2010;0.86)</td><td align="left" valign="top">0.73 (0.46&#x2010;0.92)</td><td align="left" valign="top">6167</td><td align="left" valign="top">Self-harm (n=3) and psychotic symptoms or disorder (n=1)</td></tr><tr><td align="left" valign="top">Self-harm (n=12)</td><td align="left" valign="top">0.67 (0.39&#x2010;0.86)</td><td align="left" valign="top">0.47 (0.23&#x2010;0.67)</td><td align="left" valign="top">2126</td><td align="left" valign="top">Abusive behavior (n=3) and suicidal symptoms (n=1)</td></tr><tr><td align="left" valign="top">Anxiety symptoms (n=22)</td><td align="left" valign="top">0.64 (0.43&#x2010;0.80)</td><td align="left" valign="top">0.54 (0.36&#x2010;0.69)</td><td align="left" valign="top">7481</td><td align="left" valign="top">Stress-related symptoms or disorder (n=2), sensory disturbances (n=2), and anxiety disorder (n=2)</td></tr><tr><td align="left" valign="top">Neurocognitive symptoms (n=74)</td><td align="left" valign="top">0.61 (0.49&#x2010;0.71)</td><td align="left" valign="top">0.61 (0.50&#x2010;0.69)</td><td align="left" valign="top">1802</td><td align="left" valign="top">Neurocognitive disorders (n=10), miscellaneous psychiatric symptoms (n=8), and ADHD spectrum (n=6)</td></tr><tr><td align="left" valign="top">Aggressive symptoms (n=24)</td><td align="left" valign="top">0.58 (0.40&#x2010;0.76)</td><td align="left" valign="top">0.58 (0.40&#x2010;0.74)</td><td align="left" valign="top">4275</td><td align="left" valign="top">Anxiety symptoms (n=4), mood symptoms 
(n=4), and miscellaneous psychiatric symptoms (n=2)</td></tr><tr><td align="left" valign="top">Depressive symptoms (n=39)</td><td align="left" valign="top">0.56 (0.41&#x2010;0.71)</td><td align="left" valign="top">0.68 (0.54&#x2010;0.80)</td><td align="left" valign="top">6381</td><td align="left" valign="top">Mood symptoms (n=5), miscellaneous psychiatric symptoms (n=3), and unipolar depressive disorder (n=2)</td></tr><tr><td align="left" valign="top">Pharm symptoms (n=7)</td><td align="left" valign="top">0.43 (0.16&#x2010;0.75)</td><td align="left" valign="top">0.33 (0&#x2010;0.59)</td><td align="left" valign="top">699</td><td align="left" valign="top">Sensory disturbances (n=2), psych ADE<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> (n=1), and miscellaneous psychiatric symptoms (n=1)</td></tr><tr><td align="left" valign="top">Bipolar spectrum (n=36)</td><td align="left" valign="top">0.42 (0.27&#x2010;0.58)</td><td align="left" valign="top">0.59 (0.40&#x2010;0.74)</td><td align="left" valign="top">2290</td><td align="left" valign="top">Mood symptoms (n=18), psychotic symptoms or disorder (n=2), and miscellaneous psychiatric symptoms (n=1)</td></tr><tr><td align="left" valign="top">Miscellaneous psychiatric symptoms (n=156)</td><td align="left" valign="top">0.29 (0.22&#x2010;0.36)</td><td align="left" valign="top">0.39 (0.30&#x2010;0.46)</td><td align="left" valign="top">9554</td><td align="left" valign="top">Neurocognitive symptoms (n=17), antisocial behavior (n=10), and mood symptoms (n=10)</td></tr><tr><td align="left" valign="top">Suicidal behavior (n=12)</td><td align="left" valign="top">0.25 (0.09&#x2010;0.53)</td><td align="left" valign="top">0.38 (0&#x2010;0.67)</td><td align="left" valign="top">1164</td><td align="left" valign="top">Injury (n=3), miscellaneous psychiatric symptoms (n=1), and overdose (n=1)</td></tr><tr><td align="left" valign="top">Antisocial behavior (n=10)</td><td align="left" valign="top">0.20 
(0.06&#x2010;0.51)</td><td align="left" valign="top">0.17 (0&#x2010;0.37)</td><td align="left" valign="top">1666</td><td align="left" valign="top">Personality disorder (n=3), aggressive symptoms (n=2), and miscellaneous psychiatric symptoms (n=2)</td></tr><tr><td align="left" valign="top">Sensory disturbances (n=6)</td><td align="left" valign="top">0.17 (0.03&#x2010;0.56)</td><td align="left" valign="top">0.09 (0&#x2010;0.27)</td><td align="left" valign="top">387</td><td align="left" valign="top">Psychotic symptoms or disorder (n=3) and miscellaneous psychiatric symptoms (n=2)</td></tr><tr><td align="left" valign="top">Stressor symptoms (n=5)</td><td align="left" valign="top">0 (0&#x2010;0.43)</td><td align="left" valign="top">0 (0&#x2010;0)</td><td align="left" valign="top">34</td><td align="left" valign="top">Sensory disturbances (n=2), personality disorder (n=2), and miscellaneous psychiatric symptoms (n=1)</td></tr><tr><td align="left" valign="top">Psych ADE (n=11)</td><td align="left" valign="top">0 (0&#x2010;0.26)</td><td align="left" valign="top">0 (0&#x2010;0)</td><td align="left" valign="top">151</td><td align="left" valign="top">Neurocognitive symptoms (n=6) and pharm symptoms (n=5)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Recall indicates the proportion of terms in a clinician-coded category that were classified by the model as belonging to that category.</p></fn><fn id="table2fn2"><p><sup>b</sup>ADHD: attention-deficit/hyperactivity disorder.</p></fn><fn id="table2fn3"><p><sup>c</sup>OCD: obsessive-compulsive disorder.</p></fn><fn id="table2fn4"><p><sup>d</sup>psych ADE: psychiatric adverse drug events.</p></fn></table-wrap-foot></table-wrap><p>The model exhibited the best classification performance for categories of: &#x201C;living situation&#x201D; (<italic>F</italic><sub>1</sub>-score=1, n=11 terms), &#x201C;substance use related symptoms and disorder&#x201D; (<italic>F</italic><sub>1</sub>-score=0.94, n=90 terms), 
&#x201C;eating disorder or symptoms&#x201D; (<italic>F</italic><sub>1</sub>-score=0.95, n=16 terms), &#x201C;OCD symptoms or disorder&#x201D; (<italic>F</italic><sub>1</sub>-score=0.87, n=10 terms), and &#x201C;sleep wake symptoms or disorder&#x201D; (<italic>F</italic><sub>1</sub>-score=0.86, n=37 terms). Conversely, the model performed poorly on &#x201C;miscellaneous psychiatric symptoms&#x201D; (<italic>F</italic><sub>1</sub>-score=0.39, n=156 terms), &#x201C;antisocial behavior&#x201D; (<italic>F</italic><sub>1</sub>-score=0.17, n=10 terms), &#x201C;sensory disturbances&#x201D; (<italic>F</italic><sub>1</sub>-score=0.09, n=6 terms), &#x201C;psychiatric adverse drug events&#x201D; (<italic>F</italic><sub>1</sub>-score=0, n=11 terms), and &#x201C;stressor symptoms&#x201D; (<italic>F</italic><sub>1</sub>-score=0, n=5 terms).</p><p>The most mislabeled mental health terms included &#x201C;psychiatric adverse drug events&#x201D; as &#x201C;neurocognitive symptoms&#x201D; (n=6 misclassifications) or &#x201C;pharmacological symptoms&#x201D; (n=5 misclassifications). The model also commonly mislabeled terms in &#x201C;miscellaneous psychiatric symptoms.&#x201D; There were 111 terms in the &#x201C;miscellaneous psychiatric symptoms&#x201D; category that were misclassified across 28 of 41 other categories (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec><sec id="s3-3"><title>Physical Health</title><p>Physical health terms (n=3707) were classified into 19 categories with &#x03BA; of 0.69 (95% CI 0.67&#x2010;0.70), precision of 0.76 (95% CI 0.74&#x2010;0.77), recall of 0.71 (95% CI 0.70&#x2010;0.73), and <italic>F</italic><sub>1</sub>-score of 0.72 (95% CI 0.70&#x2010;0.73). 
<xref ref-type="table" rid="table3">Table 3</xref> includes category-wise recall, <italic>F</italic><sub>1</sub>-score, and a set of the most frequent categories into which terms from the true category were misclassified (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Physical health term categories: recall, <italic>F</italic><sub>1</sub>-score, total mentions in the dataset, and most common misclassification (in descending order of recall).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Term category (n)</td><td align="left" valign="bottom">Recall (95% CI)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="bottom">Total mentions in dataset (thousands)</td><td align="left" valign="bottom">Most frequent misclassifications (n)</td></tr></thead><tbody><tr><td align="left" valign="top">Oncological conditions (n=45)</td><td align="left" valign="top">0.91 (0.79&#x2010;0.96)</td><td align="left" valign="top">0.61 (0.51&#x2010;0.70)</td><td align="left" valign="top">4549</td><td align="left" valign="top">Autoimmune and inflammatory conditions (n=1), gastrointestinal symptoms (n=1), and other physical symptoms and conditions (n=1)</td></tr><tr><td align="left" valign="top">Sensory problems (n=41)</td><td align="left" valign="top">0.90 (0.78&#x2010;0.96)</td><td align="left" valign="top">0.35 (0.27&#x2010;0.43)</td><td align="left" valign="top">3113</td><td align="left" valign="top">Neurological symptoms (n=4)</td></tr><tr><td align="left" valign="top">Cardiovascular symptoms (n=401)</td><td align="left" valign="top">0.88 (0.85&#x2010;0.91)</td><td align="left" valign="top">0.88 (0.85&#x2010;0.90)</td><td align="left" valign="top">30,930</td><td align="left" valign="top">Other physical 
symptoms and conditions (n=14), neurological symptoms (n=9), and respiratory symptoms (n=8)</td></tr><tr><td align="left" valign="top">Respiratory symptoms (n=139)</td><td align="left" valign="top">0.84 (0.77&#x2010;0.89)</td><td align="left" valign="top">0.72 (0.66&#x2010;0.77)</td><td align="left" valign="top">27,775</td><td align="left" valign="top">Sensory problems (n=6), gastrointestinal symptoms (n=5), and other physical symptoms and conditions (n=4)</td></tr><tr><td align="left" valign="top">Infectious symptoms (n=145)</td><td align="left" valign="top">0.84 (0.77&#x2010;0.89)</td><td align="left" valign="top">0.63 (0.57&#x2010;0.68)</td><td align="left" valign="top">15,079</td><td align="left" valign="top">Hepatobiliary conditions (n=7), sensory problems (n=3), and skin and soft tissue disorders (n=3)</td></tr><tr><td align="left" valign="top">Metabolic disorders (n=63)</td><td align="left" valign="top">0.84 (0.73&#x2010;0.91)</td><td align="left" valign="top">0.68 (0.59&#x2010;0.76)</td><td align="left" valign="top">3136</td><td align="left" valign="top">Hepatobiliary conditions (n=7), endocrine symptoms (n=1), and other physical symptoms and conditions (n=1)</td></tr><tr><td align="left" valign="top">Hematological symptoms (n=122)</td><td align="left" valign="top">0.83 (0.75&#x2010;0.89)</td><td align="left" valign="top">0.81 (0.75&#x2010;0.86)</td><td align="left" valign="top">6321</td><td align="left" valign="top">Oncological conditions (n=11), gastrointestinal symptoms (n=3), and hepatobiliary conditions (n=3)</td></tr><tr><td align="left" valign="top">Neurological symptoms (n=413)</td><td align="left" valign="top">0.82 (0.78&#x2010;0.85)</td><td align="left" valign="top">0.79 (0.76&#x2010;0.82)</td><td align="left" valign="top">22,540</td><td align="left" valign="top">Sensory problems (n=38), other physical symptoms and conditions (n=8), and infectious symptoms (n=5)</td></tr><tr><td align="left" valign="top">Gastrointestinal symptoms (n=279)</td><td 
align="left" valign="top">0.81 (0.76&#x2010;0.85)</td><td align="left" valign="top">0.77 (0.72&#x2010;0.81)</td><td align="left" valign="top">24,878</td><td align="left" valign="top">Hepatobiliary conditions (n=18), autoimmune and inflammatory conditions (n=10), and infectious symptoms (n=9)</td></tr><tr><td align="left" valign="top">Skin and soft tissue disorders (n=314)</td><td align="left" valign="top">0.78 (0.73&#x2010;0.82)</td><td align="left" valign="top">0.80 (0.76&#x2010;0.83)</td><td align="left" valign="top">15,212</td><td align="left" valign="top">Infectious symptoms (n=26), other physical symptoms and conditions (n=13), and gastrointestinal symptoms (n=9)</td></tr><tr><td align="left" valign="top">Genitourinary symptoms (n=201)</td><td align="left" valign="top">0.77 (0.71&#x2010;0.82)</td><td align="left" valign="top">0.81 (0.76&#x2010;0.85)</td><td align="left" valign="top">8571</td><td align="left" valign="top">Gastrointestinal symptoms (n=12), infectious symptoms (n=11), and other physical symptoms and conditions (n=7)</td></tr><tr><td align="left" valign="top">Renal disorders (n=52)</td><td align="left" valign="top">0.75 (0.62&#x2010;0.85)</td><td align="left" valign="top">0.76 (0.65&#x2010;0.84)</td><td align="left" valign="top">2221</td><td align="left" valign="top">Infectious symptoms (n=5), genitourinary symptoms (n=4), and cardiovascular symptoms (n=3)</td></tr><tr><td align="left" valign="top">Endocrine symptoms (n=98)</td><td align="left" valign="top">0.67 (0.58&#x2010;0.76)</td><td align="left" valign="top">0.71 (0.63&#x2010;0.78)</td><td align="left" valign="top">4942</td><td align="left" valign="top">Metabolic disorders (n=16), sensory problems (n=4), and autoimmune and inflammatory conditions (n=3)</td></tr><tr><td align="left" valign="top">Musculoskeletal symptoms (n=480)</td><td align="left" valign="top">0.67 (0.63&#x2010;0.71)</td><td align="left" valign="top">0.79 (0.75&#x2010;0.82)</td><td align="left" valign="top">21,785</td><td 
align="left" valign="top">Other physical symptoms and conditions (n=62), neurological symptoms (n=39), and autoimmune and inflammatory conditions (n=13)</td></tr><tr><td align="left" valign="top">Pain symptoms (n=59)</td><td align="left" valign="top">0.59 (0.47&#x2010;0.71)</td><td align="left" valign="top">0.61 (0.51&#x2010;0.71)</td><td align="left" valign="top">18,045</td><td align="left" valign="top">Other physical symptoms and conditions (n=6), neurological symptoms (n=5), and gastrointestinal symptoms (n=4)</td></tr><tr><td align="left" valign="top">Autoimmune and inflammatory conditions (n=68)</td><td align="left" valign="top">0.54 (0.43&#x2010;0.66)</td><td align="left" valign="top">0.50 (0.40&#x2010;0.60)</td><td align="left" valign="top">6234</td><td align="left" valign="top">Infectious symptoms (n=9), other physical symptoms and conditions (n=9), and skin and soft tissue disorders (n=4)</td></tr><tr><td align="left" valign="top">Hepatobiliary conditions (n=54)</td><td align="left" valign="top">0.54 (0.41&#x2010;0.66)</td><td align="left" valign="top">0.45 (0.33&#x2010;0.56)</td><td align="left" valign="top">1970</td><td align="left" valign="top">Gastrointestinal symptoms (n=11), cardiovascular symptoms (n=4), and other physical symptoms and conditions (n=3)</td></tr><tr><td align="left" valign="top">Other physical symptoms and conditions (n=559)</td><td align="left" valign="top">0.47 (0.42&#x2010;0.51)</td><td align="left" valign="top">0.54 (0.50&#x2010;0.58)</td><td align="left" valign="top">31,151</td><td align="left" valign="top">Sensory problems (n=68), neurological symptoms (n=39), and skin and soft tissue disorders (n=29)</td></tr><tr><td align="left" valign="top">Respiratory disorders (n=173)</td><td align="left" valign="top">0.40 (0.33&#x2010;0.47)</td><td align="left" valign="top">0.55 (0.48&#x2010;0.63)</td><td align="left" valign="top">7120</td><td align="left" valign="top">Respiratory symptoms (n=50), infectious symptoms (n=28), and other 
physical symptoms and conditions (n=10)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Recall indicates the proportion of terms in a clinician-coded category that were classified by the model as belonging to that category.</p></fn></table-wrap-foot></table-wrap><p>The model exhibited the best classification performance for categories of: &#x201C;cardiovascular symptoms&#x201D; (n=401 terms), &#x201C;hematological symptoms&#x201D; (n=122 terms), and &#x201C;genitourinary symptoms&#x201D; (n=201 terms), with recall and <italic>F</italic><sub>1</sub>-score values &#x003E;0.80. Conversely, the model performed poorly on &#x201C;sensory problems&#x201D; (<italic>F</italic><sub>1</sub>-score=0.35, n=41 terms), &#x201C;hepatobiliary conditions&#x201D; (<italic>F</italic><sub>1</sub>-score=0.45, n=54 terms), and &#x201C;other physical symptoms and conditions&#x201D; (<italic>F</italic><sub>1</sub>-score=0.54, n=559 terms).</p><p>The model commonly predicted the category &#x201C;sensory problems&#x201D; for terms from the categories &#x201C;other physical symptoms and conditions&#x201D; (n=68 misclassifications) and &#x201C;neurological symptoms&#x201D; (n=38 misclassifications). The model also commonly mislabeled &#x201C;other physical symptoms and conditions.&#x201D; There were 299 &#x201C;other physical symptoms and conditions&#x201D; terms that were misclassified across 18 other categories (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We investigated GPT-4&#x2019;s ability to replicate clinical judgment when classifying EHR terms from a dataset of mental health patients into interpretable clinical categories. A recent review of NLP studies found the agreement of human coding of EHR data to range from 0.72 to 0.94 (Cohen &#x03BA;) [<xref ref-type="bibr" rid="ref28">28</xref>]. 
Based on this benchmark, GPT-4 showcases human-like agreement with clinical experts when classifying EHR terms as either mental or physical health. Yet, GPT-4&#x2019;s classification performance varied widely across mental health and physical health categories and had high error rates for certain categories (eg, &#x201C;sensory problems&#x201D; and &#x201C;stressor symptoms&#x201D;). Misclassifications highlighted GPT-4&#x2019;s biases, such as the tendency for broad categories (eg, &#x201C;other physical symptoms and conditions&#x201D;) to be underselected. Instead, terms from these categories were allocated to more specific categories (eg, &#x201C;cutting&#x201D; was allocated to &#x201C;injury&#x201D; instead of &#x201C;self-harm&#x201D;).</p><p>Nevertheless, GPT-4 was able to rapidly transform a feature set of 4553 individual EHR terms into 61 clinically valid groups, which can be readily implemented into prediction models. State-of-the-art LLMs have already been used alongside traditional NLP methods, such as named entity recognition, text clustering, and supervised machine learning models trained on text data [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref31">31</xref>]. Additionally, LLMs can explain categorization decisions, providing valuable insights for end users of integrated clinical tools.</p></sec><sec id="s4-2"><title>Limitations</title><p>LLMs occasionally &#x201C;hallucinate,&#x201D; generating outputs that are off-task, nonsensical, or contradictory. Although we prompted the model to validate the output and correct for hallucinations, as the creativity and complexity of tasks increase, so does the risk of aberrant outputs [<xref ref-type="bibr" rid="ref32">32</xref>]. 
Moreover, recent studies have found that LLM performance on certain clinical tasks can substantially improve when given 1 or multiple examples for codes, a process known as &#x201C;few-shot&#x201D; learning [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. In contrast, our study used &#x201C;zero-shot&#x201D; learning, where GPT-4 was asked to classify clinical terms without being provided with any specific examples or definitions for the coding system. This method was chosen to assess the model&#x2019;s baseline classification performance, without introducing any more task-specific bias. However, we recognize that because the coding system was developed by only 2 clinicians, bias may be introduced due to their unique sets of clinical experiences, institutional practices, and personal preferences. The LLM may be biased as well. An ad hoc analysis indicated a tendency for the model to underuse &#x201C;other&#x201D; categories (eg, &#x201C;other physical symptoms and conditions&#x201D; and &#x201C;miscellaneous psychiatric symptoms&#x201D;) relative to clinician coders (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Nonetheless, we acknowledge that many clinical terms in EHRs are inherently ambiguous and may be classified under multiple categories depending on the context. Without knowing that the sample consists of people hospitalized with a mental health disturbance, it is not necessarily a misclassification for GPT-4 to label &#x201C;gunshot wound&#x201D; as a physical injury and not an indicator of suicide. The task of assigning a single, mutually exclusive label may limit one&#x2019;s ability to capture the full complexity of the clinical term. 
While this study provides a preliminary framework for exploring the feasibility of using LLMs for unstructured EHR classification, future research should aim to involve a varied set of coding methods, classification approaches (eg, multi-label classification), and a larger cohort of clinician-coders to enhance generalizability. Finally, we note that several categories in the mental health domain had too few terms (&#x003C;5) to yield stable estimates of agreement and were removed from the analysis.</p></sec><sec id="s4-3"><title>Implications</title><p>The accuracy of clinical term classification is essential for downstream predictive models that rely on structured data, as inaccuracies can propagate through the model pipeline. Understanding the sensitivity of these models to variations in input labels is key, especially when distinguishing between random errors and systematic misclassifications. Systematic errors, where specific categories are consistently mislabeled, may significantly affect the robustness of models trained on such data, potentially more so than random error (ie, noise) [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Moreover, the assumption that accurate categorization of clinical terms is a necessary intermediate step is worth reconsidering. As LLMs advance, there is potential for these models to bypass the traditional 2-stage process and make direct predictions from unstructured text [<xref ref-type="bibr" rid="ref30">30</xref>]. Future research is needed to determine whether bypassing the intermediate categorization step entirely might enhance or hinder model performance, depending on the specific clinical application.</p></sec><sec id="s4-4"><title>Conclusion</title><p>As LLMs continue to advance, the time and human resources required to distill a large corpus of EHR terms into clinically meaningful groups can be greatly reduced. 
LLMs have the potential to be integrated into EHR systems to create text-based features for prediction models in real time. This study found that a state-of-the-art LLM achieved high agreement with classifications of experienced clinicians across terms from numerous physical and mental health categories.</p></sec></sec></body><back><ack><p>This work was supported by the National Institutes of Health (R01MH126895). We appreciate the contribution of Ming Xie in dataset preparation and extensive analytical support.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">NLP</term><def><p>natural language processing</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hossain</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Moni</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Uddin</surname><given-names>S</given-names> </name></person-group><article-title>Use of electronic health data for disease prediction: a comprehensive literature review</article-title><source>IEEE/ACM Trans Comput Biol Bioinform</source><year>2021</year><volume>18</volume><issue>2</issue><fpage>745</fpage><lpage>758</lpage><pub-id pub-id-type="doi">10.1109/TCBB.2019.2937862</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mahmoudi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kamdar</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gonzales</surname><given-names>G</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Waljee</surname><given-names>AK</given-names> </name></person-group><article-title>Use of electronic medical records in development and validation of risk prediction models of hospital readmission: systematic review</article-title><source>BMJ</source><year>2020</year><month>04</month><day>8</day><volume>369</volume><fpage>m958</fpage><pub-id pub-id-type="doi">10.1136/bmj.m958</pub-id><pub-id pub-id-type="medline">32269037</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boudreaux</surname><given-names>ED</given-names> </name><name name-style="western"><surname>Haskins</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Larkin</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Emergency department safety assessment and follow-up evaluation 2: an implementation trial to improve suicide prevention</article-title><source>Contemp Clin Trials</source><year>2020</year><month>08</month><volume>95</volume><fpage>106075</fpage><pub-id pub-id-type="doi">10.1016/j.cct.2020.106075</pub-id><pub-id pub-id-type="medline">32565041</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boudreaux</surname><given-names>ED</given-names> </name><name 
name-style="western"><surname>Rundensteiner</surname><given-names>E</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Applying machine learning approaches to suicide prediction using healthcare data: overview and future directions</article-title><source>Front Psychiatry</source><year>2021</year><volume>12</volume><fpage>707916</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2021.707916</pub-id><pub-id pub-id-type="medline">34413800</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NU</given-names> </name><name name-style="western"><surname>Haack</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baxter</surname><given-names>SL</given-names> </name></person-group><article-title>Clinical implementation of predictive models embedded within electronic health record systems: a systematic review</article-title><source>Informatics (MDPI)</source><year>2020</year><month>09</month><volume>7</volume><issue>3</issue><fpage>25</fpage><pub-id pub-id-type="doi">10.3390/informatics7030025</pub-id><pub-id pub-id-type="medline">33274178</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bayramli</surname><given-names>I</given-names> </name><name name-style="western"><surname>Castro</surname><given-names>V</given-names> </name><name name-style="western"><surname>Barak-Corren</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Predictive structured-unstructured interactions in EHR models: a case study of suicide prediction</article-title><source>NPJ Digit 
Med</source><year>2022</year><month>01</month><day>27</day><volume>5</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1038/s41746-022-00558-0</pub-id><pub-id pub-id-type="medline">35087182</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahajan</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Ghani</surname><given-names>R</given-names> </name></person-group><article-title>Combining structured and unstructured data for predicting risk of readmission for heart failure patients</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>238</fpage><lpage>242</lpage><pub-id pub-id-type="doi">10.3233/SHTI190219</pub-id><pub-id pub-id-type="medline">31437921</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Coiera</surname><given-names>E</given-names> </name></person-group><source>Guide to Health Informatics</source><year>2015</year><edition>3</edition><publisher-name>CRC Press</publisher-name></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akbilgic</surname><given-names>O</given-names> </name><name name-style="western"><surname>Homayouni</surname><given-names>R</given-names> </name><name name-style="western"><surname>Heinrich</surname><given-names>K</given-names> </name><name name-style="western"><surname>Langham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>R</given-names> </name></person-group><article-title>Unstructured text in EMR improves prediction of death after surgery in children</article-title><source>Informatics 
(MDPI)</source><year>2019</year><volume>6</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.3390/informatics6010004</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marafino</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Park</surname><given-names>M</given-names> </name><name name-style="western"><surname>Davies</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Validation of prediction models for critical care outcomes using natural language processing of electronic health record data</article-title><source>JAMA Netw Open</source><year>2018</year><month>12</month><day>7</day><volume>1</volume><issue>8</issue><fpage>e185097</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.5097</pub-id><pub-id pub-id-type="medline">30646310</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bousselham</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nfaoui</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Mourhir</surname><given-names>A</given-names> </name></person-group><article-title>Fine-tuning GPT on biomedical NLP tasks: an empirical evaluation</article-title><conf-name>2024 International Conference on Computer, Electrical &#x0026; Communication Engineering (ICCECE)</conf-name><conf-date>Feb 2-3, 2024</conf-date><conf-loc>Kolkata, India</conf-loc><pub-id pub-id-type="doi">10.1109/ICCECE58645.2024.10497313</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Shekhar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tiwari</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rensink</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Eskander</surname><given-names>R</given-names> </name><name name-style="western"><surname>Salloum</surname><given-names>W</given-names> </name></person-group><article-title>Coupling symbolic reasoning with language modeling for efficient longitudinal understanding of unstructured electronic medical records</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 7, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.03360</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chew</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bollenbacher</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wenger</surname><given-names>M</given-names> </name><name name-style="western"><surname>Speer</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>A</given-names> </name></person-group><article-title>LLM-assisted content analysis: using large language models to support deductive coding</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 23, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2306.14924</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>QV</given-names> </name><name name-style="western"><surname>Abdelghani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Oudeyer</surname><given-names>PY</given-names> </name></person-group><article-title>Supporting qualitative analysis with large language models: combining codebook with GPT-3 for deductive coding</article-title><conf-name>IUI &#x2019;23: 28th International Conference on Intelligent User Interfaces</conf-name><conf-date>Mar 27-31, 2023</conf-date><conf-loc>Sydney, Australia</conf-loc><pub-id pub-id-type="doi">10.1145/3581754.3584136</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hou</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Prompt-based and fine-tuned GPT models for context-dependent and -independent deductive coding in social annotation</article-title><conf-name>LAK &#x2019;24: The 14th Learning Analytics and Knowledge Conference</conf-name><conf-date>Mar 18-22, 2024</conf-date><conf-loc>Kyoto, Japan</conf-loc><pub-id pub-id-type="doi">10.1145/3636555.3636910</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Gilardi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Alizadeh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kubli</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT outperforms crowd workers for text-annotation tasks</article-title><source>Proc Natl Acad Sci U S A</source><year>2023</year><month>07</month><day>25</day><volume>120</volume><issue>30</issue><fpage>e2305016120</fpage><pub-id pub-id-type="doi">10.1073/pnas.2305016120</pub-id><pub-id pub-id-type="medline">37463210</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>T&#x00F6;rnberg</surname><given-names>P</given-names> </name></person-group><article-title>ChatGPT-4 outperforms experts and crowd workers in annotating political Twitter messages with zero-shot learning</article-title><source>arXiv</source><comment>Preprint posted online on Apr 13, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.06588</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Hua</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Large language models in mental health care: a scoping review</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 1, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.02984</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bedi</surname><given-names>P</given-names> </name></person-group><article-title>Towards optimal NLP solutions: analyzing GPT and LLaMA-2 models across model scale, dataset size, and task diversity</article-title><source>Eng Technol Appl Sci Res</source><year>2024</year><volume>14</volume><issue>3</issue><fpage>14219</fpage><lpage>14224</lpage><pub-id pub-id-type="doi">10.48084/etasr.7200</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Yanagita</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yokokawa</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fukuzawa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Uchida</surname><given-names>S</given-names> </name><name name-style="western"><surname>Uehara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ikusaka</surname><given-names>M</given-names> </name></person-group><article-title>Assessing the ability of GPT to generate illness scripts: an evaluation study</article-title><source>medRxiv</source><comment>Preprint posted online on  Dec 27, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.12.25.23300525</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Moradi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Blagec</surname><given-names>K</given-names> </name><name name-style="western"><surname>Haberl</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Samwald</surname><given-names>M</given-names> </name></person-group><article-title>GPT-3 models are poor few-shot learners in the biomedical domain</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 6, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.02555</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 26, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.02.21.23285886</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>User guide: clinical classifications software refined (CCSR)</article-title><source>Agency for Healthcare Research and Quality Healthcare Cost and Utilization Project (HCUP)</source><year>2019</year><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://hcup-us.ahrq.gov/toolssoftware/ccsr/DXCCSR-User-Guide-v2019-1.pdf">https://hcup-us.ahrq.gov/toolssoftware/ccsr/DXCCSR-User-Guide-v2019-1.pdf</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>openai/openai-python: the official Python library for the OpenAI API</article-title><source>GitHub</source><year>2024</year><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/openai/openai-python">https://github.com/openai/openai-python</ext-link></comment></nlm-citation></ref><ref 
id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kramer</surname><given-names>O</given-names> </name></person-group><source>Scikit-Learn: Machine Learning for Evolution Strategies</source><year>2016</year><access-date>2025-01-15</access-date><publisher-name>Springer International Publishing</publisher-name><fpage>45</fpage><lpage>53</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://link.springer.com/10.1007/978-3-319-33383-0_5">http://link.springer.com/10.1007/978-3-319-33383-0_5</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fung</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name></person-group><article-title>Bootstrap estimate of the variance and confidence interval of kappa</article-title><source>Br J Ind Med</source><year>1991</year><month>07</month><volume>48</volume><issue>7</issue><fpage>503</fpage><lpage>504</lpage><pub-id pub-id-type="doi">10.1136/oem.48.7.503</pub-id><pub-id pub-id-type="medline">1854654</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scharp</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hobensack</surname><given-names>M</given-names> </name><name name-style="western"><surname>Davoudi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Topaz</surname><given-names>M</given-names> </name></person-group><article-title>Natural language processing applied to clinical documentation in post-acute care settings: a scoping review</article-title><source>J Am Med Dir 
Assoc</source><year>2024</year><month>01</month><volume>25</volume><issue>1</issue><fpage>69</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.1016/j.jamda.2023.09.006</pub-id><pub-id pub-id-type="medline">37838000</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garrido-Merchan</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Gozalo-Brizuela</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gonzalez-Carvajal</surname><given-names>S</given-names> </name></person-group><article-title>Comparing BERT against traditional machine learning models in text classification</article-title><source>J Comput Cogn Eng</source><year>2023</year><volume>2</volume><fpage>352</fpage><lpage>356</lpage></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A scoping review of using large language models (LLMs) to investigate electronic health records (EHRs)</article-title><source>arXiv</source><year>2024</year><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2405.03066">https://arxiv.org/abs/2405.03066</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Sushil</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mandair</surname><given-names>D</given-names> 
</name><etal/></person-group><article-title>A comparative study of zero-shot inference with large language models and supervised modeling in breast cancer pathology classification</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.13887</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>M</given-names> </name></person-group><article-title>A mathematical investigation of hallucination and creativity in GPT models</article-title><source>Mathematics</source><year>2023</year><volume>11</volume><issue>10</issue><fpage>2320</fpage><pub-id pub-id-type="doi">10.3390/math11102320</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Labrak</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rouvier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dufour</surname><given-names>R</given-names> </name></person-group><article-title>A zero-shot and few-shot study of instruction-finetuned large language models applied to clinical and biomedical tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 22, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.12114</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kelley</surname><given-names>M</given-names> </name><name name-style="western"><surname>Samolyk-Mazzanti</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Visweswaran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>An empirical evaluation of prompting strategies for large language models in zero-shot clinical natural language processing: algorithm development and validation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>8</day><volume>12</volume><fpage>e55318</fpage><pub-id pub-id-type="doi">10.2196/55318</pub-id><pub-id pub-id-type="medline">38587879</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Hovy</surname><given-names>D</given-names> </name></person-group><article-title>Predictive biases in natural language processing models: a conceptual framework and overview</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 9, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1912.11078</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Guan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name></person-group><article-title>Towards trustworthy LLMs: a review on debiasing 
and dehallucinating in large language models</article-title><source>Artif Intell Rev</source><year>2024</year><volume>57</volume><issue>9</issue><fpage>243</fpage><pub-id pub-id-type="doi">10.1007/s10462-024-10896-y</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frenay</surname><given-names>B</given-names> </name><name name-style="western"><surname>Verleysen</surname><given-names>M</given-names> </name></person-group><article-title>Classification in the presence of label noise: a survey</article-title><source>IEEE Trans Neural Netw Learn Syst</source><year>2014</year><volume>25</volume><issue>5</issue><fpage>845</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1109/TNNLS.2013.2292894</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Code and prompt design.</p><media xlink:href="medinform_v13i1e65454_app1.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Classification performance metrics output.</p><media xlink:href="medinform_v13i1e65454_app2.xlsx" xlink:title="XLSX File, 76 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Analysis of misclassifications.</p><media xlink:href="medinform_v13i1e65454_app3.xlsx" xlink:title="XLSX File, 67 KB"/></supplementary-material></app-group></back></article>