<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e86533</article-id><article-id pub-id-type="doi">10.2196/86533</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Automated ICD-10&#x2013;Anchored Classification of Primary Care Text Data: Development and Evaluation of a Custom Multilabel Classifier</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Haag</surname><given-names>Christina</given-names></name><degrees>Dr. 
phil.</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Grischott</surname><given-names>Thomas</given-names></name><degrees>PD, Dr med</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Burgstaller</surname><given-names>Jakob M</given-names></name><degrees>Prof Dr med, Dr med dent, PhD, EMBA</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Markun</surname><given-names>Stefan</given-names></name><degrees>PD, Dr med</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Senn</surname><given-names>Oliver</given-names></name><degrees>Prof Dr med</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>von Wyl</surname><given-names>Viktor</given-names></name><degrees>Prof. Dr. sc. 
nat.</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Epidemiology, Biostatistics and Prevention Institute, University of Zurich</institution><addr-line>Hirschengraben 84</addr-line><addr-line>Zurich</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Institute for Implementation Science in Health Care, University of Zurich</institution><addr-line>Universit&#x00E4;tstrasse 84</addr-line><addr-line>Zurich</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>Institute of Primary Care, University Hospital Zurich, University of Zurich</institution><addr-line>Zurich</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Triep</surname><given-names>Karen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Viktor von Wyl, Prof. Dr. sc. 
nat., Epidemiology, Biostatistics and Prevention Institute, University of Zurich, Hirschengraben 84, Zurich, 8001, Switzerland, +41 44 63 46380; <email>viktor.vonwyl@uzh.ch</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>6</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e86533</elocation-id><history><date date-type="received"><day>26</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>18</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>06</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Christina Haag, Thomas Grischott, Jakob M Burgstaller, Stefan Markun, Oliver Senn, Viktor von Wyl. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 6.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e86533"/><abstract><sec><title>Background</title><p>Electronic medical records are a vast and valuable source of information, useful for tasks such as estimating disease prevalence. However, in routine primary care, much of this information is in free-text format rather than in a structured form and, therefore, not readily amenable to analysis. Manual coding of this textual data is both time-consuming and resource-intensive, making it impractical for large datasets. Although powerful open-source language models offer new opportunities for automated coding, their use on short heterogeneous primary care notes, particularly in German-language settings, remains insufficiently studied.</p></sec><sec><title>Objective</title><p>By providing hands-on guidance for applied health researchers, this study aims to demonstrate the effective and accurate automatic classification of free-text notes using a language model fine-tuned for automated <italic>International Statistical Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>) coding.</p></sec><sec sec-type="methods"><title>Methods</title><p>Building on the extensive Family Medicine Research Using Electronic Medical Records (FIRE) routine database from the Institute of Primary Care at the University Hospital Zurich and the University of Zurich, we trained a large language model&#x2013;based multilabel classifier on a dataset of 38,728 free-text notes, which had been manually categorized into 47 classes using specific <italic>ICD-10</italic> codes and code ranges or nondiagnostic/ad hoc labels (eg, &#x201C;unclear diagnosis,&#x201D; &#x201C;status post&#x201D;). 
We stratified the labeled data into training (70%), validation (15%), and posttraining test (15%) sets, ensuring similar label distributions across these sets. Using the Transformers Python library, we trained the model over 10 epochs and evaluated it on the posttraining test dataset.</p></sec><sec sec-type="results"><title>Results</title><p>Across 47 classes, the FIRE classifier achieved strong performance on the held-out posttraining set, with <italic>F</italic><sub>1</sub>-scores of 0.85 (micro, overall across all predictions), 0.86 (macro, mean of per-class scores treating classes equally), and 0.84 (weighted, per-class scores weighted by class frequency).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrates steps for training open-source large language models and highlights the potential to streamline and scale the extraction of diagnostic information for practical applications. Our model can be robustly deployed, for example, for prescreening and labeling of free-text information, thus potentially reducing the burden of repetitive and error-prone manual handling.</p></sec></abstract><kwd-group><kwd>electronic medical records</kwd><kwd>free-text classification</kwd><kwd>ICD-10 coding</kwd><kwd>language models</kwd><kwd>natural language processing</kwd><kwd>automated coding</kwd><kwd>primary care</kwd><kwd>clinical text mining</kwd><kwd>health informatics</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Electronic medical records (EMRs) contain rich diagnostic information, but in routine primary care, much of this information is documented in short unstructured free-text format rather than as structured records. As a result, large-scale analysis and reuse of diagnostic data remains challenging, since manual coding of free-text notes is both time-consuming and resource-intensive, making it impractical on a large scale. 
Therefore, automated assignment of diagnostic codes, in particular from the <italic>International Statistical Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>), has long been of practical interest for research and health care.</p><p>The emergence of powerful open-source language models offers new opportunities to automate text-based diagnosis extraction for research or health care. Transformer-based models like bidirectional encoder representations from transformers (BERT) and generative pretrained transformers have gained traction for <italic>ICD-10</italic> coding of clinical texts, achieving strong performance on English language EMR datasets [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. However, research on the classification of text data into <italic>ICD-10</italic>&#x2013;anchored classes has been conducted mainly in English [<xref ref-type="bibr" rid="ref1">1</xref>]. While recent studies demonstrate that domain-adapted transformer models can perform well on non-English clinical texts&#x2014;including Portuguese [<xref ref-type="bibr" rid="ref3">3</xref>], French [<xref ref-type="bibr" rid="ref4">4</xref>], and Spanish [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]&#x2014;the availability of data and pretrained models varies widely across languages. For German clinical texts, resources remain limited, and existing work has largely focused on well-structured, high-quality documents such as discharge summaries or surgery reports. In routine primary care documentation in particular, notes are often short, loosely structured, and heterogeneous. Diagnostic coding may also combine <italic>ICD-10</italic> codes with administrative or primary care&#x2013;specific systems such as the <italic>International Classification of Primary Care</italic>. 
These characteristics limit the direct reuse of existing models and underscore the need for approaches that are robust to real-world data and adaptable to local coding practices.</p><p>Taken together, these factors&#x2014;a scarcity of annotated German clinical text datasets from routine care, the predominance of English language research, and the short heterogeneous nature of primary care notes&#x2014;leave little guidance on how to adapt open-source large language models (LLMs) for routine use in German primary care. To address this gap, our study describes the implementation of a flexible, German-language, multilabel LLM classifier for real-world free-text notes in primary care. Although focused on a German-language dataset, the methodological approach may serve as a blueprint in settings beyond English.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Source: The Family Medicine Research Using Electronic Medical Records Database</title><p>The Swiss Family Medicine Research Using Electronic Medical Records (FIRE) database was initiated in 2009 and has been designed as a basic knowledge base for primary care research through the use of EMRs [<xref ref-type="bibr" rid="ref7">7</xref>]. So far, the FIRE project has enrolled around 450 general practitioners throughout Switzerland who are willing to contribute their patients&#x2019; EMRs to the database. General practitioners in the FIRE project routinely document patient encounters&#x2014;in clinics, at home, or by phone&#x2014;and records are regularly exported to the FIRE database. The reasons for physician encounters are encoded using the <italic>International Classification of Primary Care, 2nd edition</italic>, albeit not systematically by all participating clinicians. 
Instead, many participating physicians often include unstructured free-text notes, particularly in the health problems and diagnoses fields, where diagnostic and health care use information is recorded.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The local ethics committee of the Canton of Zurich waived approval for research with the FIRE database because patient data is fully anonymized and, therefore, outside the scope of the Swiss Human Research Act (BASEC-Nr. Req2017&#x2013;00797). The study was conducted in accordance with the Declaration of Helsinki and Good Clinical Practice guidelines.</p></sec><sec id="s2-3"><title>Diagnosis and Problem List Free-Text Notes</title><p>This study used manually annotated free-text notes from the diagnosis and problem lists in Swiss general practitioners&#x2019; EMRs. In Swiss primary care, these lists function as longitudinal note-based records with brief entries summarizing patients&#x2019; current and past health problems (eg, &#x201C;hypertension,&#x201D; &#x201C;low back pain&#x201D;) as well as related events or observations. Each note represents a single, concise observation that persists across consultations, as structured diagnostic coding is not routinely used in Swiss outpatient care.</p></sec><sec id="s2-4"><title>Training an LLM-Based Classifier for Automated Text Classification</title><sec id="s2-4-1"><title>Overview</title><p>All analyses and visualizations were performed in Python (version 3.11; Python Software Foundation), using the Jupyter Notebook environment (version 2024.3.2; Project Jupyter). Individual analysis steps and specific Python libraries are detailed in the following sections. The overall procedure is visualized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. 
The Jupyter Notebook containing the Python code and complete model training output is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The figure visualizes the modeling procedure to build the multilabel classifier based on the FIRE dataset. BERT: bidirectional encoder representations from transformers; FIRE: Family Medicine Research Using Electronic Medical Records.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86533_fig01.png"/></fig></sec><sec id="s2-4-2"><title>Step 1: Ground Truth Data for Classifier Training</title><p>We worked with a total of 53,481 free-text notes, incorporating data from Walln&#x00F6;fer et al [<xref ref-type="bibr" rid="ref8">8</xref>] and additional notes annotated thereafter (n=26,501, 49.6%). Walln&#x00F6;fer et al [<xref ref-type="bibr" rid="ref8">8</xref>] developed an <italic>ICD-10</italic>&#x2013;anchored framework comprising 105 diagnostic codes, including both individual <italic>ICD-10</italic> codes and code ranges, as well as a nonspecific &#x201C;no diagnosis&#x201D; category. For the present study, a subset of these codes was consolidated into broader groups by combining related diagnostic categories into higher-level ICD classifications, resulting in 44 categories. In addition, three less specific encounter codes not included in the original framework were introduced&#x2014;&#x201C;suspected diagnosis,&#x201D; &#x201C;unclear,&#x201D; and &#x201C;status post&#x201D;&#x2014;bringing the total to 47 categories. Reported interrater reliability for the original codes used in this study&#x2014;either directly or in aggregated form&#x2014;ranged from moderate to almost perfect (&#x03BA;=0.53-0.98); however, these values are not directly transferable to the aggregated categories. 
For the Walln&#x00F6;fer et al [<xref ref-type="bibr" rid="ref8">8</xref>] dataset (n=26,980, 50.4%), two physician raters independently coded the notes (full details are reported in Walln&#x00F6;fer et al [<xref ref-type="bibr" rid="ref8">8</xref>]). The remaining 26,501 (49.6%) notes were annotated by a single trained physician.</p></sec><sec id="s2-4-3"><title>Step 2: Data Preprocessing</title><p>To create a more balanced dataset, we reduced the number of entries for the most heavily overrepresented <italic>ICD-10</italic> code range Z00-Z99 &#x201C;factors influencing health status and contact with health services,&#x201D; which occurred in 42.5% (n=22,714) of all free-text notes, either alone or in combination with other labels. Specifically, we removed all 14,316 free-text entries assigned exclusively to this code range, reducing the overall dataset to 38,728 free-text notes. As a result, this code range&#x2019;s representation was reduced to 8226 (21.2%), reflecting only those entries where Z00-Z99 appeared alongside other labels.</p></sec><sec id="s2-4-4"><title>Step 3: Subdividing the Data Into Training, Validation, and Posttraining Test Data</title><p>We then divided the preprocessed dataset (n=38,728) into three parts: 70% (n=27,108) for training, 15% (n=5810) for validation during training, and 15% (n=5810) for final testing. To ensure each class label was proportionally represented in all three sets, we used a stratified splitting process based on the final list of 47 labels, using the <italic>iterstrat</italic> Python package and its MultilabelStratifiedShuffleSplit class for multilabel stratified splitting. 
In other words, if a particular class made up, for instance, 10% of the entire dataset, it also contained roughly 10% of the training set, 10% of the validation set, and 10% of the posttraining test set.</p></sec><sec id="s2-4-5"><title>Step 4: Model Training</title><p>To build our classifier, we used a German BERT LLM (dbmdz/bert-base-german-uncased) from the Hugging Face model hub [<xref ref-type="bibr" rid="ref9">9</xref>] and replaced its top layer with a classification head to identify the 47 classes in unstructured medical free text. Tokenization was performed using the pretrained tokenizer associated with GermanBERT, a German-language&#x2013;specific model; no multilingual or custom tokenization was applied (see executed Python script in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The model&#x2019;s maximum input length was 512 tokens. No truncation was applied during training or inference, and the model always processed the complete note (median 15 tokens, mean 18 tokens, maximum 111 tokens). Since a single free-text note may belong to multiple categories, we used a multilabel classification approach to allow the assignment of several labels to each note. For model training, we used the Trainer class from the Transformers [<xref ref-type="bibr" rid="ref10">10</xref>] Python library, which streamlines model training and metric calculation. We set the decision threshold for the binary predictions (true/false) at 0.5 per class.</p></sec><sec id="s2-4-6"><title>Step 5: Classifier Evaluation During Training</title><p>We monitored evaluation metrics across all 10 epochs throughout training. During each epoch, the model iterated over all training batches, updating its weights via backpropagation based on the binary cross-entropy training loss.</p><sec id="s2-4-6-1"><title>Overall Model Evaluation</title><p>After each epoch, we evaluated model quality on the validation set. 
We first recorded training loss&#x2014;the average discrepancy between predictions and ground truth on the training data; lower is better. We computed the same metric on the validation set (validation loss) to assess generalization: if validation loss falls and then rises while training loss keeps falling, the model is starting to memorize rather than generalize. We also tracked the microaveraged <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall), which gives more weight to more frequent classes. In addition, we measured the area under the receiver operating characteristic curve, summarizing how well the model separates positives from negatives across all thresholds (values near 1 indicate excellent separation; values near 0.5 indicate chance performance). Finally, because examples can have multiple labels, we reported subset (exact match) accuracy on the validation set&#x2014;the fraction of examples for which every label is predicted correctly.</p></sec><sec id="s2-4-6-2"><title>Per-Class Evaluation</title><p>Because each free-text note could belong to multiple classes, we examined model performance for each label individually using precision, recall, and <italic>F</italic><sub>1</sub>-score metrics. These metrics are particularly informative for multilabel classification because they capture how well the model identifies positive cases (recall), avoids false positives (precision), and balances both aspects (<italic>F</italic><sub>1</sub>-score). 
Accuracy was not used on a per-class basis, as it is dominated by true negatives in imbalanced datasets and may, therefore, provide a misleading indication of classifier performance.</p></sec></sec></sec><sec id="s2-5"><title>Step 6: Classifier Evaluation on the Posttraining Test Data</title><p>The classifier was subsequently evaluated on the held-out 15% posttraining test set, which had not been used during model training or validation.</p><sec id="s2-5-1"><title>Overall Model Evaluation</title><p>To provide an overall measure of performance across all diagnostic labels, we calculated the micro-, macro-, and weighted averages of precision, recall, and <italic>F</italic><sub>1</sub>-score. The microaveraged metrics combine all true and false positives and negatives across labels, capturing the model&#x2019;s overall ability to classify instances correctly. The macroaveraged scores assign equal weight to each label, offering a balanced view that is not influenced by class frequency. In contrast, the weighted average metrics account for label prevalence, summarizing performance in proportion to how often each label occurs in the data.</p></sec><sec id="s2-5-2"><title>Per-Class Evaluation</title><p>To complement the aggregate measures, the classifier&#x2019;s performance was also examined for each diagnostic category in the posttraining test set. For every label, precision, recall, and <italic>F</italic><sub>1</sub>-score were computed based on the corresponding ground truth and predicted binary vectors. 
These per-class metrics highlight how accurately the model distinguishes between individual diagnostic groups, allowing detailed insight into which classes were predicted most reliably and which proved more challenging.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>On average, each free-text note was assigned to 1.27 different classes, with the number of classes per free-text note ranging from 1 to 4. The class with the highest number of free-text notes was the umbrella class &#x201C;factors influencing health status and contact with health services&#x201D; (Z00-Z99; n=8226). Other classes with particularly high counts included &#x201C;status post diagnosis&#x201D; (n=3655) and &#x201C;diseases of the musculoskeletal system and connective tissue&#x201D; (M00-M99; n=2883). Conversely, classes such as &#x201C;dementia&#x201D; (F00-F03; n=97), &#x201C;heart failure&#x201D; (I50; n=116), and &#x201C;acute upper respiratory infections&#x201D; (J00-J06; n=163) were among those with the lowest number of free-text notes.</p></sec><sec id="s3-2"><title>Monitoring Model Metrics</title><p>All model metrics during training and for the test data are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-3"><title>Model Metrics During Training</title><p>Over the course of 10 training epochs, overall model performance reached a plateau, with the highest <italic>F</italic><sub>1</sub>-score reaching 0.85 achieved at epoch 10, indicating convergence of model learning. Overall model accuracy at this point was 0.75, and the area under the receiver operating characteristic curve equaled 0.92. 
The trained FIRE classifier demonstrated robust performance across the majority of classes; specifically, precision during training averaged 0.88 (SD 0.08), recall averaged 0.84 (SD 0.14), and the <italic>F</italic><sub>1</sub>-score averaged 0.86 (SD 0.11).</p></sec><sec id="s3-4"><title>Performance on Posttraining Test Data</title><p>Across 47 classes, the FIRE classifier achieved strong performance on the held-out posttraining set, with <italic>F</italic><sub>1</sub>-scores of 0.85 (micro, overall across all predictions), 0.86 (macro, mean of per-class scores treating classes equally), and 0.84 (weighted, per-class scores weighted by class frequency). The classifier achieved the highest performance for &#x201C;Vitamin D deficiency&#x201D; (E55) with an <italic>F</italic><sub>1</sub>-score of 1.0, followed closely by &#x201C;Disorders of lipoprotein metabolism and other lipidaemias&#x201D; (E78), &#x201C;Essential (primary) hypertension&#x201D; (I10), and &#x201C;Diseases of liver&#x201D; (K70-K87), all exceeding 0.97. Strong results were also observed for a range of chronic and metabolic conditions such as obesity (E65-E68), &#x201C;Asthma&#x201D; (J45), and &#x201C;Mood disorders&#x201D; (F30-F39), with <italic>F</italic><sub>1</sub>-scores above 0.95. In contrast, performance declined for certain classes with fewer samples or greater diagnostic overlap, such as &#x201C;Unclear diagnosis&#x201D; (<italic>F</italic><sub>1</sub>-score 0.54) and &#x201C;Congenital malformations, deformations and chromosomal abnormalities&#x201D; (Q00-Q99; <italic>F</italic><sub>1</sub>-score 0.52). 
As displayed in <xref ref-type="fig" rid="figure2">Figure 2</xref>, <italic>F</italic><sub>1</sub>-scores for the posttraining test data were generally high, with variation across the different labels.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Performance of the Family Medicine Research Using Electronic Medical Records (FIRE) classifier on the posttraining test dataset. The bar chart displays the <italic>F</italic><sub>1</sub>-scores (x-axis) for each diagnostic class (y-axis) predicted by the FIRE classifier on the 15% test posttraining dataset, unseen during training. Larger bars indicate better classification performance.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86533_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Extracting medical diagnoses or encounters from free-text EMRs or research data still poses critical practice challenges. By using a labeled dataset of ~53,000 records from a nationwide general practice research database, our study developed a classifier to predict appropriate codes in German free-text notes using a locally adapted diagnosis and medical encounter classification system. As such, our study offers insights into not only the challenges but also the potential rewards of creating locally adapted multiclassifier systems for medical free-text notes. Our multilabel classifier showed robust performance in classifying 47 different diagnostic and medical encounter categories during training and on unseen posttraining test data, with varying amounts of available labeled training data across categories. Our multilabel classifier was able to predict around half of the included diagnostic and medical encounter categories with high <italic>F</italic><sub>1</sub>-scores of 0.8 or greater. 
These findings showcase the potential usefulness of our classifier approach for systematic use in research.</p><p>Therefore, our study demonstrates the feasibility for health care providers to develop locally adapted text classification models designed to meet their unique data requirements and integration needs within operational workflows. We encountered challenges previously discussed in the literature [<xref ref-type="bibr" rid="ref11">11</xref>], such as predicting a large number of classes simultaneously (a large label space) and dealing with an unbalanced label distribution, in particular with the predominant class &#x201C;factors influencing health status and contact with health services.&#x201D; Despite preprocessing to mitigate its overrepresentation by removing free-text notes labeled with this class label, its high prevalence persisted due to co-occurrence with other classes, which we did not wish to thin out. This feature, while not ideal, reflects the real-world data scenario and underpins the relevance of our findings. Importantly, despite these real-world challenges, our classifier maintained commendable performance on both common and rare classes, as well as on new, unseen data, providing valuable insights for health care providers interested in developing custom models amid similar data challenges.</p><p>Determining the optimal sample size for training multilabel classifiers remains a topic of debate, as there is no straightforward method such as the power analysis used in hypothesis testing. Our study shows that even for low-frequency classes (ie, with fewer than 100 labeled free-text notes available) such as hypertensive diseases, model performance can still be high. For health care providers looking to develop their own classifiers, our results suggest that the required sample size for each category could be fewer than 100 text samples when training a multilabel classifier of a similar size. 
However, it is important to note that the required sample size varies depending on how well the classes are defined by specific terms, such as &#x201C;dementia,&#x201D; versus more ambiguous terms, such as &#x201C;suspected diagnosis.&#x201D; Generally, our manual error investigation suggested that the quality of data labeling is of crucial importance.</p><p>There has been ongoing discussion about shifting toward prompt-based classification and moving beyond manually curated training data by leveraging industry-developed, general-purpose LLMs such as Google&#x2019;s Gemini or OpenAI&#x2019;s ChatGPT. However, a key drawback of currently available powerful general-purpose LLMs, such as OpenAI&#x2019;s GPT-4o, is that they are proprietary, closed-source, and tied to cloud-based paid services operated by large technology companies. This makes them unsuitable for sensitive clinical data, where local processing and data privacy are essential. Nonetheless, in the future, open-source general-purpose LLMs that can be run on local infrastructure are very likely to become more capable.</p><p>Our research has several limitations that merit consideration. We used a dataset specifically designed for this use case, which is characterized by its short text notes. Documents of different types, especially those with more extensive content such as admission or discharge summaries, might require alternative methods. For example, researchers dealing with longer documents may need to use models such as Longformer [<xref ref-type="bibr" rid="ref12">12</xref>], which are designed to process longer text data. Most transformer-based models, including German BERT, typically have a default input constraint of 128 tokens, which can be extended to a maximum of 512 tokens&#x2014;any text data longer than this is truncated. In the context of the German BERT tokenizer, a token is generally equivalent to a single word. 
Consequently, training LLMs on longer documents using Longformer models [<xref ref-type="bibr" rid="ref12">12</xref>] could lead to different results. In addition, the text data was manually labeled by medical school graduates, and labeling may be prone to errors. Errors in data labeling can affect model performance and are a common challenge in the data labeling process. Finally, the predictive results of the model trained in this study are specifically tailored for use at the Institute of Primary Care.</p><p>An important consideration when interpreting our results is the <italic>ICD-10</italic> code range Z00-Z99 &#x201C;Factors influencing health status and contact with health services,&#x201D; which primarily captures contextual or encounter-related information rather than explicit disease entities. In routine primary care documentation, such codes may function as a nonspecific &#x201C;catch-all&#x201D; category, particularly when diagnostic certainty is low. To mitigate potential bias, we excluded notes labeled exclusively with these encounter-related categories and retained them only when they co-occurred with disease-related labels, treating them as contextual modifiers rather than stand-alone outcomes. Nonetheless, its continued prevalence reflects real-world documentation practices. Notably, macro- and microaveraged <italic>F</italic><sub>1</sub>-scores were very similar, suggesting that performance was not driven solely by the dominant encounter-related class. Accordingly, overall performance should be interpreted with an emphasis on macroaveraged metrics and alongside per-class results (<xref ref-type="fig" rid="figure2">Figure 2</xref> and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which contains the executed analysis and full per-class results), providing a more balanced view across both common encounter codes and less frequent disease categories. 
Moreover, predictions of encounter-related codes should be interpreted as reflecting health care use context rather than disease burden, and the classifier should be viewed as a screening tool rather than an autonomous diagnostic system.</p><p>Beyond these methodological considerations, an important ethical issue is the potential for residual bias amplification given the imbalanced category distribution, mainly due to the overrepresented encounter-related codes. Although we excluded free-text notes labeled exclusively with encounter-related codes, the high prevalence of this class may still influence model behavior. For this reason, transparency regarding model design, training data, and class-specific performance is essential to support responsible use and clinician adoption. The classifier presented in this study is therefore best suited for screening and structuring heterogeneous clinical free-text notes, rather than as an autonomous diagnostic system, and it should not be used in settings requiring very high precision, such as downstream epidemiological analyses. Future research should focus on improving model explainability to support clinical acceptance and to facilitate the identification of potential biases during deployment.</p><p>This study uses supervised multilabel classification, which requires a sufficient number of positive examples per label to learn stable decision boundaries and produce reliable, interpretable performance estimates. We therefore focus on more prevalent <italic>ICD-10</italic> classes. Diagnostic categories with very low prevalence fall into a few-shot or zero-shot regime and cannot be robustly modeled within the same supervised fine-tuning framework. Although we explored prompt-based zero-shot label assignment using an open-source German LLM, performance was inconsistent and substantially inferior to supervised classification. 
Accordingly, this study concentrates on diagnostic categories with adequate training data for robust supervised model development. Rare conditions remain an important but distinct methodological challenge and may be better addressed through alternative paradigms such as few-shot retrieval-based methods or entailment-based classification.</p><p>For classification, we used <italic>bert-base-german-uncased</italic> as a well-established and widely used baseline model to demonstrate supervised multilabel classification of German free-text notes in primary care. While alternative transformer architectures may offer performance gains, particularly through domain adaptation or different pretraining objectives, systematic comparisons were beyond the scope of this work. Relevant candidates for future evaluation include domain-adapted German biomedical models such as medBERT.de [<xref ref-type="bibr" rid="ref13">13</xref>], GottBERT [<xref ref-type="bibr" rid="ref14">14</xref>], and Me-LLaMA 70B [<xref ref-type="bibr" rid="ref15">15</xref>], as well as alternative pretraining paradigms such as German ELECTRA [<xref ref-type="bibr" rid="ref16">16</xref>], which has shown strong performance in text classification tasks. In addition, multilingual architectures (eg, the multilingual mBERT [<xref ref-type="bibr" rid="ref17">17</xref>]) may be of interest for cross-lingual transfer scenarios. Future research would benefit from systematically assessing trade-offs between predictive performance, computational requirements, and deployment feasibility when applying these architectures to real-world clinical settings.</p><p>In summary, our study demonstrates the feasibility and benefits of training tailored multilabel text classifiers to categorize diagnostic physician notes in primary care. 
Combined, our work and documentation of the classifier development process lay a solid foundation for applied researchers and health care providers to leverage their own rich datasets for sophisticated statistical analyses and rigorous research. Moreover, after further validation, such multilabel classifiers could potentially also be used for generating more meaningful, precise, and actionable feedback to the participating general practitioners and thus help them improve the quality of care for their patients. We believe that our work is particularly relevant for health care organizations that accumulate vast amounts of data over time, including rich open text records that often exceed the amount of data typically collected by large research studies in a feasible time frame and budget. For example, many inpatient clinics have extensive longitudinal data, both quantitative and qualitative, including detailed patient histories across multiple admissions. However, our work also underscores the importance of high-quality data labeling and thoughtful integration of artificial intelligence into health care workflows to meaningfully enhance patient care.</p></sec></body><back><notes><sec><title>Funding</title><p>This study is supported by a grant from the Federal Quality Commission c/o Federal Office of Public Health, Bern, Switzerland. 
The funder had no role in study design, data collection and analysis, preparation of the manuscript, and decision to publish.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb2">EMR</term><def><p>electronic medical record</p></def></def-item><def-item><term id="abb3">FIRE</term><def><p>Family Medicine Research Using Electronic Medical Records</p></def></def-item><def-item><term id="abb4"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ginige</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Obst</surname><given-names>O</given-names> </name></person-group><article-title>AI-based ICD coding and classification approaches using discharge summaries: a systematic literature review</article-title><source>Expert Syst Appl</source><year>2023</year><month>03</month><volume>213</volume><fpage>118997</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2022.118997</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Perez-Concha</surname><given-names>O</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Bennett</surname><given-names>V</given-names> </name><name name-style="western"><surname>Jorm</surname><given-names>L</given-names> </name></person-group><article-title>Automated ICD coding using extreme multi-label long text transformer-based models</article-title><source>Artif Intell Med</source><year>2023</year><month>10</month><volume>144</volume><fpage>102662</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2023.102662</pub-id><pub-id pub-id-type="medline">37783551</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coutinho</surname><given-names>I</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>B</given-names> </name></person-group><article-title>Transformer-based models for ICD-10 coding of death certificates with Portuguese text</article-title><source>J Biomed Inform</source><year>2022</year><month>12</month><volume>136</volume><fpage>104232</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104232</pub-id><pub-id pub-id-type="medline">36307020</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Arnaud</surname><given-names>&#x00C9;</given-names> </name><name name-style="western"><surname>Elbattah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gignon</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dequen</surname><given-names>G</given-names> </name></person-group><article-title>Learning embeddings from free-text triage notes using pretrained transformer models</article-title><conf-name>15th International Joint Conference on Biomedical Engineering Systems and Technologies</conf-name><conf-date>Feb 9-11, 2022</conf-date><conf-loc>Vienna, 
Austria</conf-loc><fpage>835</fpage><lpage>841</lpage><pub-id pub-id-type="doi">10.5220/0011012800003123</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Padilla Cuevas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reyes-Ortiz</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Cuevas-Rasgado</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Mora-Guti&#x00E9;rrez</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Bravo</surname><given-names>M</given-names> </name></person-group><article-title>M&#x00E9;dicoBERT: a medical language model for Spanish natural language processing tasks with a question-answering application using hyperparameter optimization</article-title><source>Appl Sci (Basel)</source><year>2024</year><volume>14</volume><issue>16</issue><fpage>7031</fpage><pub-id pub-id-type="doi">10.3390/app14167031</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Iglesia</surname><given-names>I</given-names> </name><name name-style="western"><surname>Atutxa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gojenola</surname><given-names>K</given-names> </name><name name-style="western"><surname>Barrena</surname><given-names>A</given-names> </name></person-group><article-title>EriBERTa: a bilingual pre-trained language model for clinical natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 12, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2306.07373</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chmiel</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bhend</surname><given-names>H</given-names> </name><name name-style="western"><surname>Senn</surname><given-names>O</given-names> </name><name name-style="western"><surname>Zoller</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rosemann</surname><given-names>T</given-names> </name><collab>FIRE study-group</collab></person-group><article-title>The FIRE project: a milestone for research in primary care in Switzerland</article-title><source>Swiss Med Wkly</source><year>2011</year><volume>140</volume><fpage>w13142</fpage><pub-id pub-id-type="doi">10.4414/smw.2011.13142</pub-id><pub-id pub-id-type="medline">21279858</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walln&#x00F6;fer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Burgstaller</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Weiss</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rosemann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Senn</surname><given-names>O</given-names> </name><name name-style="western"><surname>Markun</surname><given-names>S</given-names> </name></person-group><article-title>Developing and testing a framework for coding general practitioners&#x2019; free-text diagnoses in electronic medical records - a reliability study for generating training data in natural language processing</article-title><source>BMC Prim Care</source><year>2024</year><month>07</month><day>16</day><volume>25</volume><issue>1</issue><fpage>257</fpage><pub-id pub-id-type="doi">10.1186/s12875-024-02514-1</pub-id><pub-id pub-id-type="medline">39014311</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>MDZ Digital Library team</collab></person-group><article-title>Bert-base-german-uncased</article-title><source>Hugging Face</source><year>2023</year><access-date>2026-04-02</access-date></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 9, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>A survey of automated International Classification of Diseases coding: development, challenges, and applications</article-title><source>Intell Med</source><year>2022</year><month>08</month><volume>2</volume><issue>3</issue><fpage>161</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1016/j.imed.2022.03.003</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>AL</given-names> </name></person-group><article-title>Longformer: the long-document transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.05150</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2022</year><month>01</month><day>31</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Scheible</surname><given-names>R</given-names> </name><name name-style="western"><surname>Thomczyk</surname><given-names>F</given-names> </name><name name-style="western"><surname>Tippmann</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jaravine</surname><given-names>V</given-names> </name><name name-style="western"><surname>Boeker</surname><given-names>M</given-names> </name></person-group><article-title>GottBERT: a pure German language model</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 3, 2020</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2012.02110</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Me LLaMA: foundation large language models for medical applications</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 20, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2402.12749</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>K</given-names> </name><name name-style="western"><surname>Luong</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Le</surname><given-names>QV</given-names> </name><name name-style="western"><surname>Manning</surname><given-names>CD</given-names> </name></person-group><article-title>ELECTRA: pre-training text encoders as discriminators rather than generators</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 23, 2020</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2003.10555</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>Proceedings of the 2019 Conference of the North American 
Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source><year>2019</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>4171</fpage><lpage>4186</lpage><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Jupyter Notebook containing the Python code and complete model training output.</p><media xlink:href="medinform_v14i1e86533_app1.pdf" xlink:title="PDF File, 766 KB"/></supplementary-material></app-group></back></article>