<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e73020</article-id><article-id pub-id-type="doi">10.2196/73020</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Predicting 30-Days Hospital Readmission for Patients with Heart Failure Using Electronic Health Record Embeddings: Comparative Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shakya</surname><given-names>Prabin</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Khaneja</surname><given-names>Ayush</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wagholikar</surname><given-names>Kavishwar B</given-names></name><degrees>PhD, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Laboratory of Computer Science, 
Massachusetts General Hospital</institution><addr-line>399 Revolution Drive, 7th Floor</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Abiye</surname><given-names>Alfoalem Araba</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Liu</surname><given-names>Sibei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Prabin Shakya, MSc, Laboratory of Computer Science, Massachusetts General Hospital, 399 Revolution Drive, 7th Floor, Boston, MA, 02145, United States, 1 8595360114; <email>prabinrs@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>25</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e73020</elocation-id><history><date date-type="received"><day>23</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Prabin Shakya, Ayush Khaneja, Kavishwar B Wagholikar. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 25.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e73020"/><abstract><sec><title>Background</title><p>Heart failure (HF) is a public health concern with a wider impact on quality of life and cost of care. One of the major challenges in HF is the higher rate of unplanned readmissions and suboptimal performance of models to predict the readmissions. Hence, in this study, we implemented embeddings-based approaches to generate features for improving model performance.</p></sec><sec><title>Objective</title><p>The objective of this study was to evaluate and compare the effectiveness of different feature embedding approaches for improving the prediction of unplanned readmissions in patients with heart failure.</p></sec><sec sec-type="methods"><title>Methods</title><p>We compared three embedding approaches including <italic>word2vec</italic> on terminology codes and concept unique identifier (CUIs) and BERT on descriptive text of concept with baseline (one hot-encoding). 
We compared area under the receiver operating characteristic (AUROC) and <italic>F</italic><sub>1</sub>-scores for the logistic regression, eXtreme gradient-boosting (XGBoost), and artificial neural network (ANN) models using these embedding approaches. The model was tested on the heart failure cohort (N=21,031) identified using the least restrictive phenotyping method from the MIMIC-IV dataset.</p></sec><sec sec-type="results"><title>Results</title><p>We found that the embedding approaches significantly improved the performance of the prediction models. XGBoost performed better for all approaches. The <italic>word2vec</italic> embeddings (0.65) trained on the dataset outperformed embeddings from the pre-trained BERT model (0.59) using descriptive text.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Embedding methods, particularly <italic>word2vec</italic> trained on electronic health record data, can better discriminate HF readmission cases compared to both one-hot encoding and pre-trained BERT embeddings on concept descriptions, making it a viable approach for automating feature selection. The observed AUROC improvement (0.65 vs 0.54) may support more effective risk stratification and targeted clinical interventions.</p></sec></abstract><kwd-group><kwd>heart failure</kwd><kwd>machine learning</kwd><kwd>embedding techniques</kwd><kwd>electronic health record</kwd><kwd>EHR</kwd><kwd>hospital readmission</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Heart failure (HF) is a major and growing public health concern, affecting around 3% of the adult population of high-income countries, with increasing prevalence in low- and middle-income countries [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. It is estimated that 56.2 million people worldwide are affected by HF [<xref ref-type="bibr" rid="ref2">2</xref>]. 
HF is also associated with high rates of unplanned readmissions, which strain the health care system and diminish patient quality of life. The 1-year readmission rate is as high as 53% globally and 59% in the United States [<xref ref-type="bibr" rid="ref3">3</xref>]. Given the significant burden imposed by readmissions, healthcare providers strive to identify patients with HF at risk for readmission. Several predictive models have been developed for this purpose, including traditional statistical models and machine learning models. However, their predictive performance remains suboptimal in real-world applications, with area under the receiver operating characteristic curve (AUROC) of 0.6 when using electronic health record (EHR) data [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Medical codes such as International Classification of Diseases (ICD) codes, procedure codes, and medication identifiers vary in structure and granularity, introducing complexity and high dimensionality into the data. These high-dimensional spaces make it difficult to extract meaningful patterns for prediction. Natural language processing and embedding techniques such as word2vec and BERT offer a promising approach to address these challenges by transforming discrete medical codes into continuous vector representations. These embeddings capture semantic relationships between medical codes, enabling models to understand the similarities and differences between related diagnoses and procedures [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Several studies have demonstrated that embedding methods may improve the performance of predictive models in healthcare by reducing dimensionality while preserving relevant clinical information [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>To address these challenges, our study investigates the utility of EHR embedding techniques and their performance in the prediction of 30-day hospital readmissions among patients with HF, leveraging the open data available in the MIMIC-IV dataset. The 30-day readmission metric is widely used in healthcare policy and quality assessment. In the United States, the Centers for Medicare and Medicaid Services includes this metric in the Hospital Readmissions Reduction Program, which penalizes hospitals for excess readmissions [<xref ref-type="bibr" rid="ref8">8</xref>]. As such, accurate prediction of 30-day readmission risk has direct implications for hospital reimbursement and patient care planning.</p><p>We compared two embedding approaches: (1) <italic>word2vec</italic> embeddings trained on terminology codes and Unified Medical Language System (UMLS) Concept Unique Identifiers (CUIs) and (2) BERT embeddings derived from concept descriptors. We hypothesize that embeddings derived from structured medical codes capture patient-level clinical patterns more effectively than embeddings generated from descriptive text. 
This is because code-based embeddings reflect co-occurrence and usage patterns in real clinical settings, while text-based embeddings may lack the specificity and contextual relevance needed for structured prediction tasks.</p><p>Word2vec is a widely used embedding technique to represent words (or codes) in a vector space, trained using shallow neural networks on text corpora [<xref ref-type="bibr" rid="ref9">9</xref>]. In the clinical domain, word2vec has been successfully used to embed medical concepts from EHR data [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. In contrast, BERT is a transformer-based language model that generates contextualized embeddings by capturing the position and surrounding context of each word [<xref ref-type="bibr" rid="ref13">13</xref>]. BERT and its biomedical adaptations such as BioBERT and BioClinicalBERT have shown promise for a range of tasks in biomedical and clinical domains [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Among these, BioBERT is pretrained on PubMed abstracts, while BioClinicalBERT extends it with clinical narratives from MIMIC. Lee et al [<xref ref-type="bibr" rid="ref6">6</xref>] demonstrated that textual descriptions of ICD codes can be effectively used for semantic-preserving vector representations.</p><p>While BERT provides contextualized embeddings that excel in natural language understanding, its effectiveness in representing brief and structured medical concept descriptions remains uncertain. In contrast, word2vec, trained on real-world sequences of medical codes, may better capture clinically relevant co-occurrence patterns. 
In this study, we aim to empirically evaluate whether code-based embeddings (eg, word2vec on medical codes) outperform text-based embeddings (eg, BERT on concept descriptions) in predicting 30-day hospital readmissions, given the structured nature of EHR data.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study design</title><p>For this study, we used Medical Information Mart for Intensive Care (MIMIC)-IV version 2.2, a publicly available dataset released in January 2023. The MIMIC-IV dataset contains comprehensive, de-identified EHR data for patients admitted to the Beth Israel Deaconess Medical Center between 2008 and 2019. The dataset includes 299,712 patients and 431,231 hospital admissions [<xref ref-type="bibr" rid="ref18">18</xref>]. Diagnoses are coded using the ICD, with both 9th and 10th Clinical Modification (CM) revisions. Procedures are recorded using both <italic>ICD</italic>-9 and <italic>ICD</italic>-10 Procedure Coding System codes. Medications are recorded with the National Drug Code (NDC) along with the Drug name, route, strength, etc. To develop predictive models for HF readmission, we used data from the hospital module capturing demographic information, diagnosis, medication and procedure codes.</p><p>We identified patients with HF using the least restrictive phenotyping method, which defines HF-positive cases as any patient with at least one relevant <italic>ICD</italic>-9 or <italic>ICD</italic>-10 code for HF [<xref ref-type="bibr" rid="ref19">19</xref>]. We included HF-positive patients only for this study. 30-day readmission was defined as any subsequent admission occurring within 30 days of discharge, regardless of the reason for readmission. Patients were excluded from the cohort if they died (n=1475), or the length of stay was less than one day (n=1740) at the indexed admission. 
The exclusion criteria were applied because patients who died during the index admission were not at risk for readmission, and the definition of &#x2018;inpatient&#x2019; becomes ambiguous for patients with a hospital stay of less than one day. Admissions occurring after the index admission were also excluded, as our objective was to predict readmission following the index HF hospitalization.</p><p>Data preprocessing was structured into three main components: admissions, clinical data extraction, and embedding preparation:</p><list list-type="order"><list-item><p>Admissions: Admissions were categorized as indexed, historic, and future admissions. Indexed admissions were defined as the first hospitalization with a diagnosis of HF. All the admissions prior to indexed admissions were classified as historic admissions and any subsequent hospitalizations as future admissions. We only developed models for predicting readmission after the index admissions. We included age at admission, length of stay and gender from indexed admissions as features.</p></list-item><list-item><p>Clinical Data extraction: We extracted diagnoses, procedures, and medications from structured data for selected cohorts. These were split into current if the date of record corresponds to the index admission and historic if it precedes the indexed admissions. We applied different encoding or embedding methods to these extracted data. We applied one-hot encoding for the baseline. The embedding approaches applied are described below. 
The current and historic data are combined with indexed admission features to create the final features sets (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p></list-item><list-item><p>Embedding: We applied three embedding approaches to the feature matrix to capture the semantic representations in the data, to output a new feature matrix.</p><list list-type="alpha-lower"><list-item><p>Word2Vec on Terminology Codes: ICD codes were concatenated with their version (ICD-9 or ICD-10) to form unique identifiers that account for overlapping codes between the two versions. For medications, we used NDC codes, excluding drugs that lacked NDC identifiers. Each unique code (diagnosis, procedure, or medication) was treated as a word, and the list of codes for each patient encounter was considered as a document. The word2vec model was trained using the skip-gram with negative sampling (SGNS) algorithm. The subset of training dataset was used to finetune parameters for word2vec (please refer to Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="fig" rid="figure1">Figure 1</xref>). Based on the finetuning experiments, we selected the context window size of 10, resulting in 200-dimensional embedding vectors. Separate <italic>word2Vec</italic> models were trained for diagnoses, procedures, and medications.</p></list-item><list-item><p>Word2Vec on UMLS CUIs: To map ICD and procedure codes to UMLS CUIs, we implemented a hybrid code and string-matching approach. First, codes were matched directly to UMLS using a code-matching algorithm. For unmapped code, we truncated the first three characters and attempted matching again. If multiple CUIs were returned, we selected the most generic code. For Medications, NDC codes were matched and for unmapped codes, we used a string search from the drug name. 
These mapped CUIs were then treated as &#x2018;word&#x2019; and trained in the same manner as recorded medical codes using Word2Vec.</p></list-item><list-item><p>BERT on Descriptive Text: For each medical code, we utilized its long-form descriptor to generate embeddings using pre-trained BERT models. We used BioClinicalBERT, which is specialized for clinical texts, to generate context-aware embeddings from the long descriptions provided in the MIMIC-IV datasets. For medication, as long descriptions were not available, we created one by concatenating multiple columns (drug, prod_strength, dose_val_rx, dose_unit_rx, form_val_disp, form_unit_disp and route) from the prescriptions table for example - &#x201C;Docusate Sodium 100 mg Tablet 100 mg 1 Tab 2.0 Oral.&#x201D; This approach captures semantic meaning and context-specific nuances that are often lost when only the codes are used.</p></list-item></list></list-item></list><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Graphical summary of the Study. CUI: Concept Unique Identifiers ; MIMIC-IV: Medical Information Mart for Intensive Care - IV (Database).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73020_fig01.png"/></fig><p>We developed multiple machine-learning models to predict 30-day readmissions. The embeddings generated from each approach were used as input features for these models. We trained and evaluated three machine learning algorithm&#x2014;logistic regression, and eXtream gradient-boosting (XGBoost) and artificial neural network (ANN). A feedforward ANN consisting of three fully connected layers with Tanh activation and dropout for regularization with progressively reduced dimensionality and outputs a probability via a final sigmoid activation was implemented. 
Like <italic>word2vec</italic> parameters finetuning, we run multiple experiments with a combination of different parameters and oversampling of minor classes using Synthetic Minority Oversampling Technique (SMOTE). Model performance was assessed using standard metrics such as Area under the receiver operating characteristic curve (AUROC), and <italic>F</italic><sub><italic>1</italic></sub><italic>-</italic>score. (please refer to supplement file in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for the link to the source code).</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The study used the MIMIC dataset, which contains deidentified health-related data and is publicly available through PhysioNet platform. Access to the dataset requires completion of the &#x201C;Data or Specimen Only Research&#x201D; course and acceptance of the data use agreement. The institutional review board at Mass General Brigham determined that the project does not meet the criteria for human subject research, as the dataset used does not contain any identifiable patient information. Since this study used an open dataset available in the public domain that was published by another research group, informed consent and opt-out procedures were managed by the original investigators. Therefore, no direct consent or interaction with participants was required for this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 21,031 patients were included in the study with 3933 (19%) experiencing a 30-day readmission. This cohort was randomly split into training with 14721 patients (70%), test with 3155 patients (15%), and validation with 3155 patients (15%) sets (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The average age at indexed admission was 73 years, the mean number of admissions was 2, the mean number of ICD codes used was 25 and the mean length of stay is 8 days. 
The number of admissions (<italic>P</italic>&#x003C;.001, <italic>t</italic>-statistic=10.83), number of ICD codes (<italic>P</italic>&#x003C;.001, t-statistic=12.71), and length of stay (<italic>P</italic>&#x003C;.001, <italic>t</italic>-statistic=10.41) are higher among patients with 30-day readmission, and there is no significant difference in age at index admission (<italic>P</italic>=.40, <italic>t</italic>-statistic=&#x2212;0.851) and the number of male versus female (<italic>P</italic>=.63, odds ratio =1.017). This is true in all splits of datasets as well (<xref ref-type="table" rid="table1">Table 1</xref>). The rate of 30-day readmission was 19% in the Cohort, 19% in the Train dataset, 17% in the test set, and 19% in the validate set.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Flow diagram study cohort selection. *The total number of unique excluded patients accounts for 86 individuals who met both exclusion criteria.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73020_fig02.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Cohort characteristics (N=21,031).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables</td><td align="left" valign="bottom">Total</td><td align="left" valign="bottom">Negative</td><td align="left" valign="bottom">Positive</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Sex, n</td><td align="left" valign="top">.63<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">9971</td><td align="left" valign="top">8120</td><td align="left" valign="top">1851</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" 
valign="top">11,060</td><td align="left" valign="top">8978</td><td align="left" valign="top">2082</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Admissions, mean, (SD)</td><td align="left" valign="top">2.07 (2.88)</td><td align="left" valign="top">1.98 (2.44)</td><td align="left" valign="top">2.52 (4.28)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;ICD codes, mean (SD)</td><td align="left" valign="top">24.59 (18.12)</td><td align="left" valign="top">23.83 (16.93)</td><td align="left" valign="top">27.89 (22.25)</td><td align="left" valign="top">.001<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Length of stay, mean (SD)</td><td align="left" valign="top">7.83 (8.57)</td><td align="left" valign="top">7.53 (8.04)</td><td align="left" valign="top">9.11 (10.48)</td><td align="left" valign="top">.001<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Age at index admission, mean (SD)</td><td align="left" valign="top">73.36 (13.55)</td><td align="left" valign="top">73.39 (13.55)</td><td align="left" valign="top">73.2 (13.56)</td><td align="left" valign="top">.40<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Total</td><td align="left" valign="top">21,031</td><td align="left" valign="top">17,098</td><td align="left" valign="top">3933</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup><italic>&#x03C7;</italic><sup>2</sup> test</p></fn><fn id="table1fn2"><p><sup>b</sup> <italic>t</italic> test</p></fn><fn id="table1fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>We used one-hot 
encoding as baselines, which yielded AUROC scores of 0.54 and <italic>F</italic><sub>1</sub>-scores ranging from 0.24 to 0.26 across validation and test sets. The embedding-based approaches consistently outperformed the baseline across all models. Logistic regression, XGBoost, and ANN models achieved AUROCs ranging from 0.59 to 0.64, 0.59 to 0.65 and 0.59 to 0.64, respectively. Among the embedding strategies, BERT embeddings from descriptive text performed worst, while the best AUROC (0.65) was achieved using <italic>word2vec</italic> on CUIs with XGBoost. F1 scores varied modestly, ranging from 0.32&#x2010;0.37 (validation) and 0.29&#x2010;0.34(test) (<xref ref-type="table" rid="table2">Table 2</xref>). The best logistic regression results were obtained using L2 regularization with C of 0.01 for both <italic>word2vec</italic> and BERT embeddings. For XGBoost, optimal performance was achieved with max depth of 3 and learning rate of 0.01. For ANN models, a learning rate of 0.001, dropout rate of 0.3, and hidden size of 384 yield the best outcomes across most experiments. 
Logistic regression and XGBoost performed better without oversampling, whereas the ANN showed improved performance when training set was oversampled to address class imbalance (please refer to Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Best results of different experiments by model.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiments and models</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> validation</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub> validation</td><td align="left" valign="bottom">AUROC test</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub> test</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Baseline</td></tr><tr><td align="left" valign="top">&#x2003;Logistic regression</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.26</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.25</td></tr><tr><td align="left" valign="top">&#x2003;XGBoost<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.25</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.25</td></tr><tr><td align="left" valign="top">&#x2003;ANN<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.24</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." 
valign="top">0.24</td></tr><tr><td align="left" valign="top" colspan="5">Word2vec on terminology code</td></tr><tr><td align="left" valign="top">&#x2003;Logistic regression</td><td align="char" char="." valign="top">0.64</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.64</td><td align="char" char="." valign="top">0.34</td></tr><tr><td align="left" valign="top">&#x2003;XGBoost</td><td align="char" char="." valign="top">0.65<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.38<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.65<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.33</td></tr><tr><td align="left" valign="top">&#x2003;ANN</td><td align="char" char="." valign="top">0.64</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.64</td><td align="char" char="." valign="top">0.34</td></tr><tr><td align="left" valign="top" colspan="5">Word2vec on CUIs<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Logistic regression</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.35</td><td align="char" char="." valign="top">0.62</td><td align="char" char="." valign="top">0.32</td></tr><tr><td align="left" valign="top">&#x2003;XGBoost</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.36</td><td align="char" char="." valign="top">0.65<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.35<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;ANN</td><td align="char" char="." valign="top">0.60</td><td align="char" char="." valign="top">0.34</td><td align="char" char="." 
valign="top">0.61</td><td align="char" char="." valign="top">0.32</td></tr><tr><td align="left" valign="top" colspan="5">BERT on descriptive text</td></tr><tr><td align="left" valign="top">&#x2003;Logistic regression</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.32</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.30</td></tr><tr><td align="left" valign="top">&#x2003;XGBoost</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.32</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.29</td></tr><tr><td align="left" valign="top">&#x2003;ANN</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.33</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.31</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>XGBoost: extreme gradient-boosting.</p></fn><fn id="table2fn3"><p><sup>c</sup>Oversampling of training dataset using synthetic minority oversampling technique.</p></fn><fn id="table2fn4"><p><sup>d</sup>ANN: artificial neural network.</p></fn><fn id="table2fn5"><p><sup>e</sup>Best score.</p></fn><fn id="table2fn6"><p><sup>f</sup>CUI: Concept Unique Identifier.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study presents a systematic comparison of embedding techniques for predicting 30-day heart failure readmission using EHR data. 
Our analysis of 21,031 heart failure admissions from the MIMIC-IV dataset demonstrates that the use of <italic>word2vec</italic> embeddings trained on the patient data achieved superior performance compared to traditional one-hot-encoding and a pretrained language model (BERT).</p><p>The <italic>word2vec</italic> on CUIs with XGBoost had an AUROC of 0.65, which is a substantial improvement of 0.11 over the one-hot-encoding baseline, and a modest improvement of 0.06 over the pretrained BioClinical_BERT model. The corresponding F<sub>1</sub>-score improvements of 0.10 and 0.04, respectively, demonstrate the enhanced performance of the word2vec approach.</p><p>We investigated the performance gains for word2vec using dimensionality reduction analysis. A t-distributed stochastic neighbor embedding plot (<xref ref-type="fig" rid="figure3">Figure 3</xref>) revealed that word2vec embeddings facilitated more coherent clustering of diagnostically related codes, implying superior capture of clinically relevant semantic relationships.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>t-SNE plot of diagnosis code embeddings with encoded categories.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73020_fig03.png"/></fig><p>Overall, our findings align with the literature on data-driven approaches for readmission prediction [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], while revealing important improvements that can be obtained by using embeddings. However, our results were inferior to those of previous studies that utilized expert-crafted features, where AUROC scores exceeded 0.8 [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. 
For instance, Pishgar et al [<xref ref-type="bibr" rid="ref22">22</xref>] reported an AUROC of 0.70 using features derived from event logs with ANNs, and Ben-Assuli et al [<xref ref-type="bibr" rid="ref23">23</xref>] achieved 0.88 using a mix of machine learning algorithms and expert-selected features with XGBoost. This performance differential highlights the trade-off between automated feature engineering and the use of human experts for feature selection. This is consistent with findings across other clinical areas, where domain expertise has been shown to significantly enhance predictive performance.</p><p>Notably, our results suggest that dataset-specific word2vec embeddings may outperform sophisticated pretrained language models. This is likely because approaches that directly model co-occurrence patterns within the target dataset may capture more task-specific signal as compared to models trained on broader textual corpora.</p><p>Overall, our study fills a significant gap in the literature by offering a systematic evaluation of embedding strategies, thereby providing evidence-based guidance for selecting the appropriate embedding method for clinical prediction tasks.</p><p>The study has several limitations. Patient encounters were modeled as a static collection of medical codes without considering temporal patterns. The study did not evaluate the portability of the models as the dataset is limited to a single institution. Our evaluation focused on traditional machine learning approaches (logistic regression, XGBoost, and ANN), which may not fully exploit the representational capacity of embedding methods. Future research should explore temporal sequence modeling with transformer-based architectures, graph-based representation learning, and multimodal data to capture richer clinical context. 
Validation across multiple institutions and diverse populations is essential to demonstrate portability.</p></sec><sec id="s4-2"><title>Conclusion</title><p>In conclusion, this study demonstrated that embedding techniques, particularly word2vec trained on the study dataset, are a viable automated approach for heart failure readmission prediction. Further studies are required to investigate if task-specific representation learning may be more effective than the use of general-purpose models for specific clinical applications.</p></sec></sec></body><back><ack><p>We extend our thanks to the developers of open-source tools and frameworks, including word2vec and BioClinical_BERT, which greatly facilitated the embedding generation and analysis in our study.</p></ack><notes><sec><title>Funding</title><p>This project was supported by grants from NHLBI (R01-HL151643) and Amazon Web Services.</p></sec></notes><fn-group><fn fn-type="con"><p>KBW and PRS conceptualized the study. PRS designed and performed the experiment, and prepared the manuscript. KBW reviewed and supervised experiments and manuscript. 
AK reviewed experiment design and manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ANN</term><def><p>artificial neural network</p></def></def-item><def-item><term id="abb2">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb3">CUI</term><def><p>Concept Unique Identifier</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5">HF</term><def><p>heart failure</p></def></def-item><def-item><term id="abb6">ICD</term><def><p>International Classification of Diseases</p></def></def-item><def-item><term id="abb7">MIMIC</term><def><p>Medical Information Mart for Intensive Care</p></def></def-item><def-item><term id="abb8">NDC</term><def><p>National Drug Code</p></def></def-item><def-item><term id="abb9">UMLS</term><def><p>Unified Medical Language System</p></def></def-item><def-item><term id="abb10">XGBoost</term><def><p>extreme gradient-boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savarese</surname><given-names>G</given-names> </name><name name-style="western"><surname>Becher</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Lund</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Seferovic</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rosano</surname><given-names>GMC</given-names> </name><name name-style="western"><surname>Coats</surname><given-names>AJS</given-names> </name></person-group><article-title>Global burden of heart failure: a comprehensive and updated review of epidemiology</article-title><source>Cardiovasc 
Res</source><year>2023</year><month>01</month><day>18</day><volume>118</volume><issue>17</issue><fpage>3272</fpage><lpage>3287</lpage><pub-id pub-id-type="doi">10.1093/cvr/cvac013</pub-id><pub-id pub-id-type="medline">35150240</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miranda</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Mamas</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Sex differences in the etiology and burden of heart failure across country income level: analysis of 204 countries and territories 1990-2019</article-title><source>Eur Heart J Qual Care Clin Outcomes</source><year>2023</year><month>11</month><day>2</day><volume>9</volume><issue>7</issue><fpage>662</fpage><lpage>672</lpage><pub-id pub-id-type="doi">10.1093/ehjqcco/qcac088</pub-id><pub-id pub-id-type="medline">36577147</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Mortality and readmission rates after heart failure: a systematic review and meta-analysis</article-title><source>TCRM</source><year>2021</year><volume>17</volume><fpage>1307</fpage><lpage>1320</lpage><pub-id pub-id-type="doi">10.2147/TCRM.S340587</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>MY</given-names> </name><name 
name-style="western"><surname>Son</surname><given-names>YJ</given-names> </name></person-group><article-title>Machine learning-based 30-day readmission prediction models for patients with heart failure: a systematic review</article-title><source>Eur J Cardiovasc Nurs</source><year>2024</year><month>10</month><day>21</day><volume>23</volume><issue>7</issue><fpage>711</fpage><lpage>719</lpage><pub-id pub-id-type="doi">10.1093/eurjcn/zvae031</pub-id><pub-id pub-id-type="medline">38421187</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chanda</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Vucetic</surname><given-names>S</given-names> </name></person-group><article-title>Improving medical term embeddings using UMLS Metathesaurus</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>04</month><day>29</day><volume>22</volume><issue>1</issue><fpage>114</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01850-5</pub-id><pub-id pub-id-type="medline">35488252</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Jung</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><etal/></person-group><article-title>ICD2Vec: mathematical representation of diseases</article-title><source>J Biomed Inform</source><year>2023</year><month>05</month><volume>141</volume><fpage>104361</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104361</pub-id><pub-id 
pub-id-type="medline">37054960</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ru</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Comparison of machine learning algorithms for predicting hospital readmissions and worsening heart failure events in patients with heart failure with reduced ejection fraction: modeling study</article-title><source>JMIR Form Res</source><year>2023</year><month>04</month><day>17</day><volume>7</volume><issue>1</issue><fpage>e41775</fpage><pub-id pub-id-type="doi">10.2196/41775</pub-id><pub-id pub-id-type="medline">37067873</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Hospital readmissions reduction program (HRRP)</article-title><source>CMS</source><access-date>2025-04-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cms.gov/medicare/payment/prospective-payment-systems/acute-inpatient-pps/hospital-readmissions-reduction-program-hrrp">https://www.cms.gov/medicare/payment/prospective-payment-systems/acute-inpatient-pps/hospital-readmissions-reduction-program-hrrp</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><article-title>Efficient estimation of word 
representations in vector space</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 16, 2013</comment><pub-id pub-id-type="doi">10.48550/arXiv.1301.3781</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Schuetz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Medical concept representation learning from electronic health records and its application on heart failure prediction</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 20, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1602.03686</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chanda</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Egleston</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Vucetic</surname><given-names>S</given-names> </name></person-group><article-title>EHR phenotyping via jointly embedding medical concepts and words into a unified vector space</article-title><source>BMC Med Inform Decis Mak</source><year>2018</year><month>12</month><day>12</day><volume>18</volume><issue>Suppl 4</issue><fpage>123</fpage><pub-id pub-id-type="doi">10.1186/s12911-018-0672-0</pub-id><pub-id pub-id-type="medline">30537974</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Steiger</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kroll</surname><given-names>LE</given-names> </name></person-group><article-title>Patient embeddings from diagnosis codes for health care prediction tasks: Pat2Vec Machine Learning Framework</article-title><source>JMIR AI</source><year>2023</year><month>04</month><day>21</day><volume>2</volume><issue>1</issue><fpage>e40755</fpage><pub-id pub-id-type="doi">10.2196/40755</pub-id><pub-id pub-id-type="medline">38875541</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id 
pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Boag</surname><given-names>W</given-names> </name></person-group><article-title>Publicly available clinical BERT embeddings</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 6, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.03323</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rasmy</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhi</surname><given-names>D</given-names> </name></person-group><article-title>Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction</article-title><source>npj Digit Med</source><year>2021</year><month>05</month><day>20</day><volume>4</volume><issue>1</issue><fpage>1</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1038/s41746-021-00455-y</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Solares</surname><given-names>JRA</given-names> </name><etal/></person-group><article-title>BEHRT: transformer for electronic health records</article-title><source>Sci Rep</source><year>2020</year><volume>10</volume><issue>1</issue><fpage>7155</fpage><pub-id pub-id-type="doi">10.1038/s41598-020-62922-y</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Bulgarelli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-IV, a freely accessible electronic health record dataset</article-title><source>Sci Data</source><year>2023</year><month>01</month><day>3</day><volume>10</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1038/s41597-022-01899-x</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Graham</surname><given-names>J</given-names> </name><name name-style="western"><surname>Iverson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Monteiro</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Applying computable phenotypes within a common data model to identify heart failure patients for an implantable cardiac device registry</article-title><source>IJC Heart &#x0026; Vasculature</source><year>2022</year><month>04</month><volume>39</volume><fpage>100974</fpage><pub-id pub-id-type="doi">10.1016/j.ijcha.2022.100974</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Liu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Stansbury</surname><given-names>C</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Predicting 30-day hospital readmissions using artificial neural networks with medical code embedding</article-title><source>PLoS ONE</source><year>2020</year><volume>15</volume><issue>4</issue><fpage>e0221606</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0221606</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nazyrova</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chahed</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chausalet</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dwek</surname><given-names>M</given-names> </name></person-group><article-title>Leveraging large language models for medical text classification: a hospital readmission prediction case</article-title><conf-name>2024 14th International Conference on Pattern Recognition Systems (ICPRS)</conf-name><conf-date>Jul 15-18, 2024</conf-date><pub-id pub-id-type="doi">10.1109/ICPRS62101.2024.10677826</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pishgar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Theis</surname><given-names>J</given-names> </name><name name-style="western"><surname>Del Rios</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ardati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Anahideh</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Darabi</surname><given-names>H</given-names> </name></person-group><article-title>Prediction of unplanned 30-day readmission for ICU patients with heart failure</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>05</month><day>2</day><volume>22</volume><issue>1</issue><fpage>117</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01857-y</pub-id><pub-id pub-id-type="medline">35501789</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ben-Assuli</surname><given-names>O</given-names> </name><name name-style="western"><surname>Heart</surname><given-names>T</given-names> </name><name name-style="western"><surname>Klempfner</surname><given-names>R</given-names> </name><name name-style="western"><surname>Padman</surname><given-names>R</given-names> </name></person-group><article-title>Human-machine collaboration for feature selection and integration to improve congestive Heart failure risk prediction</article-title><source>Decis Support Syst</source><year>2023</year><month>09</month><volume>172</volume><fpage>113982</fpage><pub-id pub-id-type="doi">10.1016/j.dss.2023.113982</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Fine-tuning the <italic>word2vec</italic> model.</p><media xlink:href="medinform_v13i1e73020_app1.docx" xlink:title="DOCX File, 105 KB"/></supplementary-material></app-group></back></article>