<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e77409</article-id><article-id pub-id-type="doi">10.2196/77409</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Sentence Classification&#x2013;Based Medical Status Extraction Pipeline for Electronic Health Records: Institutional Case Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Dong</surname><given-names>Chuanming</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Delange</surname><given-names>Boris</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Poiron</surname><given-names>Alex</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>El Azzouzi</surname><given-names>Mohamed</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fran&#x00E7;ois</surname><given-names>Cl&#x00E9;ment</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bouzill&#x00E9;</surname><given-names>Guillaume</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cuggia</surname><given-names>Marc</given-names></name><degrees>PhD, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Cabon</surname><given-names>Sandie</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Univ Rennes, CHU Rennes, INSERM, LTSI - UMR 1099, F-35000 Rennes, France</institution><addr-line>Rennes</addr-line><country>France</country></aff><aff id="aff2"><institution>Kereval (France)</institution><addr-line>Rennes</addr-line><country>France</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Garc&#x00ED;a-Barrag&#x00E1;n</surname><given-names>&#x00C1;lvaro</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ma</surname><given-names>Lianbo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Matos</surname><given-names>S&#x00E9;rgio</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sandie Cabon, PhD, Univ Rennes, CHU Rennes, INSERM, LTSI - UMR 1099, F-35000 Rennes, France, Rennes, France, +33 02 23 23 62 20; 
<email>sandie.cabon@univ-rennes.fr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>3</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e77409</elocation-id><history><date date-type="received"><day>13</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>27</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Chuanming Dong, Boris Delange, Alex Poiron, Mohamed El Azzouzi, Cl&#x00E9;ment Fran&#x00E7;ois, Guillaume Bouzill&#x00E9;, Marc Cuggia, Sandie Cabon. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 26.3.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e77409"/><abstract><sec><title>Background</title><p>Clinical data warehouses store large volumes of unstructured text containing valuable information about patients&#x2019; medical status. 
Traditional extraction systems based on named entity recognition (NER) identify medical terms but often fail to capture the contextual cues needed for accurate interpretation. Existing approaches to context-aware extraction differ in their reliance on expert annotation, computational power, and lexical resources, leading to uneven feasibility across institutions. Combined with heterogeneity in documentation practices and data-sharing restrictions, these limitations hinder the scalability and reuse of trained models. There is thus a need for practical frameworks that can be deployed and adapted locally within medical institutions.</p></sec><sec><title>Objective</title><p>This study aimed to introduce the Medical Status Extraction Pipeline (MSEP), a methodological framework that extracts patients&#x2019; medical status from clinical narratives through sentence classification and supports the local deployment of hybrid extractors, illustrated through an institutional case study.</p></sec><sec sec-type="methods"><title>Methods</title><p>MSEP extracts medical status by classifying sentences into predefined categories (presence, absence, or unknown) for each targeted condition. The pipeline combines modules for data selection, expert annotation, and model development, with parameters customizable to different settings. It was applied within our institutional environment on 6 conditions: smoking, hypertension, diabetes, heart failure, chronic obstructive pulmonary disease, and family history of cancer, using 12,119 manually annotated sentences from the eHOP Clinical Data Warehouse (Rennes University Hospital). 
Three types of extractors were compared: fine-tuned CamemBERT, large language model (LLM) prompt, and a rule-based baseline, evaluated through stratified 3-fold cross-validation, measuring precision, recall, specificity, macro <italic>F</italic>-score, balanced accuracy, as well as manual annotation time and model inference speed.</p></sec><sec sec-type="results"><title>Results</title><p>Among the tested approaches, the CamemBERT-based extractor achieved the best overall performance, with macro <italic>F</italic>-scores above 0.94 for 5 of the 6 medical conditions. The study also highlights that when a medical status is very sparsely represented in the training data, rule-based extractors can outperform learned models (average macro <italic>F</italic>-score 0.94 vs 0.73 for family history of cancer). This shows the pragmatic value of choosing the extraction method according to data availability. Manual annotation time per sentence ranged from 1.2 to 2.9 seconds within the pipeline (2.23 to 4.25 seconds for informative sentences), compared with 7.8 to 16.5 seconds for named entity recognition&#x2013;based systems. In our institutional experiments, the minimum time to complete all pipeline modules, from dataset construction to final extractor refinement, was 8 hours.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In our institutional case study, MSEP enabled rapid construction of datasets and extractors across multiple clinical conditions while reducing the effort required for local development. Its modular and configurable design allowed the adoption of hybrid extraction approaches and adaptation to different resource settings. 
These features highlight MSEP&#x2019;s value as a research tool and upstream component that facilitates local deployment of clinical information extraction workflows.</p></sec></abstract><kwd-group><kwd>sentence classification</kwd><kwd>information extraction</kwd><kwd>natural language processing</kwd><kwd>clinical data warehouse</kwd><kwd>deep learning</kwd><kwd>large language model</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent years, a substantial amount of clinical data has been collected from numerous patients and stored in clinical data warehouses (CDWs) [<xref ref-type="bibr" rid="ref1">1</xref>]. These databases represent a valuable resource for clinical research. In many cases, a significant portion of their content exists as free unstructured texts, such as hospital discharge summaries and emergency reports, which can be difficult to process [<xref ref-type="bibr" rid="ref2">2</xref>]. Nonetheless, these texts contain valuable information about patients&#x2019; medical status that, once extracted, can be leveraged for clinical and epidemiological research [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Harnessing these unstructured data requires developing natural language processing (NLP) methods specifically adapted to the medical domain. Extensive efforts have been made in this area, as reflected in shared tasks such as the n2c2 (National NLP Clinical Challenges) competitions, which have addressed problems of extracting patients&#x2019; family history regarding diseases, smoking, suicide, and drinking [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>According to a recent review, most NLP approaches for medical information extraction rely on named entity recognition (NER) [<xref ref-type="bibr" rid="ref2">2</xref>]. 
At first, rule-based approaches using dictionaries and terminologies to match terms and concepts were proposed [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. However, their reliance on lexical resources and medical expertise for rule construction limits their long-term viability, since rules and vocabularies require regular manual updates to reflect evolving clinical practices. To cope with this, machine learning approaches arose. These methods take advantage of the large amount of medical knowledge in databases and can automate the construction and maintenance of medical information retrieval systems [<xref ref-type="bibr" rid="ref9">9</xref>]. Its latest branch, deep learning approaches, has improved the ability to capture and use contextual information compared with rule-based and traditional machine learning methods, thanks to their neural network architectures; accordingly, this also makes their performance highly dependent on the availability of sufficient annotated data [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>However, NER-based information extraction approaches present a significant limitation. While NER can effectively extract medical entities (eg, &#x201C;cancer&#x201D; and &#x201C;hypertension&#x201D;), it cannot capture the contextual information required for a full understanding of the patient&#x2019;s status. A comprehensive assessment involves interpreting the complete syntactic and semantic structure, including subject (patient or family member), predicate (having or not having), and object (the medical condition), to determine whether a condition is present, absent, or mentioned in a familial context [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. 
Standard NER captures only the object component, necessitating additional processing by context qualifiers to detect negation, temporality, and subject attribution [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Clinical narratives further amplify these challenges, as they are typically unstructured, context-dependent, and institution-specific, with such cues often expressed at the sentence or discourse level [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. For example, in &#x201C;Tabagisme pendant 7 ans, reprise r&#x00E9;cente malgr&#x00E9; un sevrage en 2020&#x201D; (Smoking for 7 y, recent relapse despite cessation in 2020), successful interpretation requires resolving the temporal relation between cessation and relapse to infer that the patient is an active smoker, dependencies that standard NER pipelines, which treat entities and assertions separately, struggle to capture. These linguistic characteristics highlight the need for approaches more capable of sentence-level semantic interpretation.</p><p>At the same time, developing context-aware information extraction systems can be resource-intensive. The development and evaluation of NER models require extensive expert annotation to label entity boundaries and associated features within large-scale corpora [<xref ref-type="bibr" rid="ref20">20</xref>], a process made even more laborious by the sparsity of clinically relevant content [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. With the advancement of deep neural networks, embedding fusion techniques have been adopted to enhance context utilization and improve recognition accuracy in NER tasks [<xref ref-type="bibr" rid="ref24">24</xref>]. However, their cross-domain applicability is limited by high computational cost and reliance on external resources for comprehensive feature engineering. 
To overcome the limitations of traditional NER pipelines and handcrafted feature engineering, recent research has shifted toward the use of transformer-based pretrained language models that inherently capture richer contextual representations [<xref ref-type="bibr" rid="ref25">25</xref>]. Encoder-only language models such as BERT [<xref ref-type="bibr" rid="ref26">26</xref>] and CamemBERT [<xref ref-type="bibr" rid="ref27">27</xref>] have demonstrated high performance in domain-specific tasks through fine-tuning on relatively small annotated datasets [<xref ref-type="bibr" rid="ref28">28</xref>]. Decoder-only large language models (LLMs), such as ChatGPT [<xref ref-type="bibr" rid="ref29">29</xref>] and Mistral [<xref ref-type="bibr" rid="ref30">30</xref>], can achieve similar goals via prompt engineering without requiring fine-tuning. However, both approaches still depend on domain-specific expertise for model adaptation [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], and decoder-only models introduce substantial computational costs that limit their feasibility for routine institutional deployment [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>Beyond technical constraints, institutional heterogeneity, including differences in documentation practices, terminologies, and privacy regulations, presents major barriers to developing and sharing information extraction systems across institutions [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. 
Clinical documentation varies widely across institutions, incorporating nonstandard abbreviations, local terminology, and inconsistent formatting [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], which hinders the generalizability and scalability of information extraction models, requiring substantial annotation and model adaptation for each use case [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. The sensitive nature of patient data further restricts access to real-world electronic health records and limits the sharing of pretrained models, due to risks of unintended information leakage [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Consequently, research teams often need to rebuild extraction systems from scratch, even when addressing similar clinical objectives [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. In light of these limitations, there is a critical need for practical, context-aware information extraction systems that can be developed and reused within institutional boundaries.</p><p>To address these limitations, we designed the Medical Status Extraction Pipeline (MSEP) as a methodological framework that supports the local development of hybrid approaches for information extraction. MSEP reframes medical status extraction as a sentence classification task, which directly leverages contextual cues such as negation, temporality, and subject attribution while reducing annotation complexity. 
In addition, by accommodating rule-based methods, fine-tuned language models, and LLM prompting, MSEP allows institutions to select extraction approaches suited to their data availability and computational resources.</p><p>To demonstrate its utility, MSEP was applied to build extractors of patients&#x2019; status for 6 clinically relevant medical conditions [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>] from a French CDW: smoking, hypertension, diabetes, heart failure, chronic obstructive pulmonary disease (COPD), and family history of cancer. This paper first outlines the steps, functions, and requirements of MSEP, along with the materials and tools used. Next, we present the evaluation results of extractors built using MSEP and compare them with different medical information retrieval approaches (rule-based methods and prompt-based LLM extraction). Finally, we discuss the operational benefits of MSEP, particularly reduced annotation effort, and its potential use as an upstream research tool within institutional settings.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>MSEP classifies sentences in clinical documents according to whether they indicate that a patient&#x2019;s medical condition is present, absent, or unknown. Available as a Python package (repository link in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), the framework includes configurable parameters that allow its components to be adapted to local research needs and institutional constraints. 
In the following sections, we first describe the general design of the pipeline and then detail its implementation within our institutional setting.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethics approval for data reuse was obtained from the Commission Nationale de l'Informatique et des Libert&#x00E9;s (agreement number 2206739). This study was conducted on pseudonymized secondary clinical data provided by the Clinical Data Center of Rennes Hospital. In accordance with French regulations and the guidelines of the Commission Nationale de l'Informatique et des Libert&#x00E9;s, patients were informed of the potential use of their data for research purposes through the hospital's standard information notice. All data were pseudonymized by the Clinical Data Center before being transmitted to the research team. No identifying information was accessible at any stage of the analysis. No compensation was applicable or provided to participants.</p></sec><sec id="s2-3"><title>Generic Pipeline Design</title><sec id="s2-3-1"><title>Overview</title><p>The global framework of our method is depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The framework consists of two main stages: (1) dataset creation and (2) model training with cross-validation. In order to facilitate the comprehension of the framework, the abbreviations of the steps in the figure are also mentioned in the text.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the proposed iterative medical status extraction system. This pipeline is primarily designed to extract the status of one medical condition at a time; it may repeat multiple times (iterations) to improve the extractors. 
This diagram mainly shows the process of the first iteration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e77409_fig01.png"/></fig></sec><sec id="s2-3-2"><title>Dataset Creation</title><sec id="s2-3-2-1"><title>Overview</title><p>The objective of this stage is to construct the datasets required for training and validating medical status extractors. It involves three processes: (1) data collection and preprocessing, (2) data qualification, and (3) manual annotation and correction.</p></sec><sec id="s2-3-2-2"><title>Data Collection and Preprocessing</title><p>Data collection gathers unstructured texts from CDW (P1), which are preprocessed by segmenting into sentences for classification (P2). The output is a corpus of sentences with relevant metadata that maintains the relationship to their source documents.</p></sec><sec id="s2-3-2-3"><title>Data Qualification</title><p>The input to the data qualification process is the preprocessed sentence corpus. This phase assesses whether the collected data have appropriate sample density and size for effective model training. Sentences undergo automatic preannotation (D1), preferably using quickly implemented rule-based approaches that match medical status keywords. When preannotating sentences from all documents is impractical due to time and resource constraints, a random subset may be selected. If sample density is low, based on preannotation results, a portion of sentences unlikely to contain medical status information is filtered out (D2). This supervised sampling approach is designed to enhance annotation efficiency and final data quality by optimizing the training sample structure. 
The output is a preannotated sentence dataset with optimized annotation density and a balanced representation of target medical statuses.</p></sec><sec id="s2-3-2-4"><title>Manual Annotation and Correction</title><p>The preannotated sentences serve as input for manual annotation, which requires multiple medical specialists to classify them according to annotation guidelines (M1). Annotators should collaboratively annotate a subset of the sentences to calculate interannotator agreement (M2). If the agreement exceeds the predetermined threshold, the annotations form the training and validation datasets. Otherwise, disagreements are analyzed (M3) to improve guidelines, with corrections applied to the annotated data accordingly (M4). The correction approach depends on the type of disagreement, but manual review is preferred. The disagreement analysis and correction process can be repeated until the interannotator agreement exceeds the threshold. The output is a validated annotated dataset ready for model development.</p></sec></sec></sec><sec id="s2-4"><title>Model Training and Cross-Validation</title><p>The annotated datasets become the input for training or fine-tuning neural network models for medical status extraction. Since the pipeline is designed to extract the status of one medical condition at a time, each corresponding model is trained in a multiclass classification setting, where the possible status labels (eg, present, absent, and unknown) are mutually exclusive. Model stability is assessed through cross-validation by dividing annotated data into several groups (C1), using one group for validation (C3) and the remainder for training (C2) in each session. The number of cross-validation sessions equals the number of groups of data. The data grouping strategy for cross-validation should align with study objectives; for example, testing model stability across different document types requires data partitioning by document type. 
The extractor&#x2019;s performance metrics across multiple validation sets are assessed and compared with extractors based on other approaches, if available, to decide whether it has stable and state-of-the-art performance (C4). The flexibility of the pipeline configuration allows for easy benchmarking of different models (such as BERT-based models, rule-based systems, or LLM prompts), facilitating comprehensive performance comparison with minimal additional setup.</p><p>If the assessment result is positive, the pipeline ends, and the output is a trained medical status extraction model with satisfactory performance.</p></sec><sec id="s2-5"><title>Iterative Improvement Process</title><p>If, according to the assessment step (C4), the extractor does not achieve satisfactory performance (eg, a macro <italic>F</italic>-score below 90%), the pipeline allows for iterative refinement. The definition of &#x201C;satisfactory&#x201D; performance may vary depending on the objectives and standards of different research projects.</p><p>The iterative process involves both quantitative and qualitative improvements. Quantitatively, additional annotated sentences are incrementally introduced to expand the training and validation datasets. The number of added samples per iteration is not fixed and depends on multiple factors, including the time allocated for manual annotation, the number and availability of annotators, and computing resources.</p><p>Qualitatively, the pipeline may be refined by improving preannotation rules, updating annotation guidelines based on annotator feedback, or replacing rules with more accurate extractors from previous iterations for preannotation. 
The first iteration serves as an exploratory phase to identify bottlenecks, such as insufficient data for certain labels, weak vocabulary coverage in preannotation rules, or annotation inconsistencies, allowing for targeted improvements in subsequent iterations.</p></sec><sec id="s2-6"><title>Study-Specific Implementation</title><sec id="s2-6-1"><title>Data Source and Working Corpus</title><p>We used the eHOP CDW from Rennes University Hospital, which contains 80 million unstructured and 430 million structured data elements covering over 1.8 million patients [<xref ref-type="bibr" rid="ref44">44</xref>]. For this study, we extracted unstructured documents from our previous bladder cancer research dataset [<xref ref-type="bibr" rid="ref3">3</xref>], obtaining 799,470 documents from 5398 patients. This corpus encompasses diverse medical document types, including nursing care reports, medical reports, multidisciplinary follow-up and daily notes, imaging reports, and prescriptions and medication orders (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Numbers and percentages of the 9 most frequent document types in our corpus; the remaining 5020 documents are grouped under &#x201C;Other.&#x201D;</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Medical document type</td><td align="left" valign="top">Value, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Nursing care&#x2013;procedures</td><td align="char" char="." valign="top">329,359 (41)</td></tr><tr><td align="left" valign="top">Medical reports</td><td align="char" char="." valign="top">187,370 (23)</td></tr><tr><td align="left" valign="top">Nursing care&#x2013;monitoring and vitals</td><td align="char" char="." valign="top">153,865 (19)</td></tr><tr><td align="left" valign="top">Multidisciplinary follow-up and daily notes</td><td align="char" char="." 
valign="top">54,187 (7)</td></tr><tr><td align="left" valign="top">Imaging reports</td><td align="char" char="." valign="top">31,430 (4)</td></tr><tr><td align="left" valign="top">Prescriptions and medication orders</td><td align="char" char="." valign="top">13,163 (2)</td></tr><tr><td align="left" valign="top">Paramedical care</td><td align="char" char="." valign="top">13,083 (2)</td></tr><tr><td align="left" valign="top">Administrative and admission</td><td align="char" char="." valign="top">6839 (1)</td></tr><tr><td align="left" valign="top">Results and pathology reports</td><td align="char" char="." valign="top">5225 (1)</td></tr><tr><td align="left" valign="top">Other</td><td align="char" char="." valign="top">5020 (1)</td></tr></tbody></table></table-wrap></sec><sec id="s2-6-2"><title>Rationale for Extraction Method Design</title><p>MSEP is designed as a hybrid framework capable of accommodating different resource-availability scenarios. To assess this flexibility, we evaluated 3 extraction approaches that represent distinct requirements: fine-tuning an encoder-only transformer (high annotation demand and moderate computational cost), rule-based extraction (minimal computation and annotation requirements but substantial reliance on domain expertise), and LLM prompting (low annotation requirements but high computational and expertise demands) [<xref ref-type="bibr" rid="ref45">45</xref>]. Our experiments focused primarily on encoder-only fine-tuning in order to assess how effectively MSEP can reduce the associated annotation burden in practice, while the rule-based and LLM-prompting approaches served as comparative baselines for low-annotation settings. 
The rule-based approach is used for preannotation (D1) before fine-tuning, reinforcing the hybrid nature of the MSEP framework.</p><p>Regarding the models and tools used in these approaches, CamemBERT was selected for the encoder-only fine-tuning strategy because it aligns with the sentence-classification paradigm and has demonstrated strong performance in French NLP tasks [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. Furthermore, we opted for the original CamemBERT rather than domain-adapted variants such as CamemBERT-bio [<xref ref-type="bibr" rid="ref48">48</xref>], as the latter are pretrained on specialized biomedical corpora, which may bias them toward known biomedical tasks and entities [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Using a general-domain model allows us to evaluate the pipeline&#x2019;s components without relying on domain-specific pretraining and to examine how it behaves in our institutional corpus when applied to varied clinical conditions. For LLM prompting, we used Mixtral-8&#x00D7;7B-v0.1 [<xref ref-type="bibr" rid="ref51">51</xref>], which offered the strongest performance that could be efficiently supported by our institutional infrastructure (an NVIDIA A100 40 GB GPU). The rule-based extractors were developed using regular expressions and terminology specific to each targeted medical condition, provided by medical experts.</p></sec><sec id="s2-6-3"><title>Pipeline Configuration</title><p>The pipeline was independently evaluated on 6 medical conditions: smoking, hypertension, diabetes, heart failure, COPD, and family history of cancer. For each condition, we defined 3 possible statuses: &#x201C;present,&#x201D; &#x201C;absent,&#x201D; and &#x201C;unknown,&#x201D; corresponding to confirmed presence, negation, and uncertainty regarding the patient&#x2019;s condition. 
For the smoking condition, an additional status, &#x201C;former,&#x201D; was included to indicate that the patient previously smoked but has since quit (Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> provides a detailed definition of medical statuses). <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> contains the annotation guidelines, which give more details about the criteria for annotating the medical status.</p><p>We used spaCy&#x2019;s fr_core_news_md model [<xref ref-type="bibr" rid="ref52">52</xref>] for sentence segmentation (P2) and manually crafted rules based on specialist-provided keywords for preannotation (D1), supplemented by EDS-NLP&#x2019;s negation and family context qualifiers [<xref ref-type="bibr" rid="ref53">53</xref>] for smoking and family history of cancer. Two medical specialists performed annotations (M1) using the Prodigy interface [<xref ref-type="bibr" rid="ref54">54</xref>], and interannotator agreement (M2) was measured with Cohen kappa [<xref ref-type="bibr" rid="ref55">55</xref>], which is defined as:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>&#x03BA;</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>Po</italic> is the observed agreement and <italic>Pe</italic> is the expected agreement by chance. 
A threshold of 0.8 was used to determine acceptable agreement.</p><p>For CamemBERT-based extractors, we performed a 3-fold stratified cross-validation. The full annotated dataset was randomly partitioned into 3 equally sized folds (C1), preserving the proportion of annotated categories within each fold. Duplicated (identical) sentences were removed before the partition to prevent data leakage across folds. In each of the 3 cross-validation rounds, 2 folds were used for training and the remaining fold served as the validation set (C2). We trained the CamemBERT-large model [<xref ref-type="bibr" rid="ref27">27</xref>] with loss weighting to address class imbalance. The class-weighted cross-entropy loss is defined as:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mo>&#x03A3;</mml:mo></mml:mrow><mml:mi>i</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mi>N</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>N</mml:mi><mml:mi>y</mml:mi><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>p(yi | xi)</italic> is the predicted probability for the true class <italic>yi</italic> of input <italic>xi</italic>, <italic>wi</italic> is the class-specific weight, <italic>N</italic> is the 
total number of samples, <italic>C</italic> is the number of classes, and <italic>Nyi</italic> is the number of samples in the class of example <italic>i</italic>. Model performance was evaluated (C3) using precision, recall, specificity, <italic>F</italic>-score on individual status, balanced accuracy, and macro <italic>F</italic>-score on all statuses of a medical condition. Balanced accuracy was defined as:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtable columnalign="" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mi>B</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>C</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mo>&#x03A3;</mml:mo></mml:mrow><mml:mi>c</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mi>c</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mi>c</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mn>2</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>and macro <italic>F</italic>-score was defined as:</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>C</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mo>&#x03A3;</mml:mo></mml:mrow><mml:mi>c</mml:mi><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>C</italic> is the number of classes, and <italic>TPc, FNc, TNc,</italic> and <italic>FPc</italic> are respectively the true positives, false negatives, true negatives, and false positives for class <italic>c</italic>. This set of metrics is supposed to cover all necessary aspects of evaluation for an information retrieval system according to [<xref ref-type="bibr" rid="ref56">56</xref>]. For comparison (C4), we evaluated our rule-based extractors and Mixtral-8x7B-v0.1 LLM prompts on the same validation sets used for the cross-validation (C2+C3). 
To ensure a fair comparison, the Mixtral-8x7B prompts were designed using few-shot examples derived from the annotation guidelines, combining basic extraction instructions with expert-crafted example and counter-example sentences embedding specialist-provided keywords and clinical context narration.</p><p>Detailed configuration information for each step in MSEP appears in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The MSEP Python package (repository link in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) provides configurable modules to realize each step of the pipeline.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>We conducted 2 iterations of the pipeline in our institutional setting to create and refine the extractors, after which most targeted medical conditions reached satisfactory performance (average macro <italic>F</italic>-score&#x003E;0.94) for our use case. This section presents the results of these iterations, describing implementation paths, dataset characteristics, cross-validation results, and comparisons between our extractors and alternative approaches.</p></sec><sec id="s3-2"><title>Pipeline Execution and Time Requirements</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> details the implementation path and time consumption for each medical condition across both iterations.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Steps realized in the 2 iterations of the pipeline to extract each medical status. Steps within curly braces were repeated within one iteration, and the number after the * indicates how many times they were repeated. The color of the steps is consistent with <xref ref-type="fig" rid="figure1">Figure 1</xref>; blue indicates automated steps and purple indicates manual steps. 
The type of model used for preannotation (PA) and the interannotator agreement score calculated at step M2 are given underneath the list of steps. The last 4 columns show the total time consumption of preannotation (D1), manual annotation (M1) during both iterations, model fine-tuning during cross-validation (C2), and the total time consumed to go through the pipeline in 2 iterations (Ttl). Time consumption is reported in hours (h) or minutes (m), whichever is more convenient for presentation. COPD: chronic obstructive pulmonary disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e77409_fig02.png"/></fig><p>Most medical conditions were preannotated using rule-based matching, except smoking status, which was preannotated in the second iteration using a CamemBERT-based extractor fine-tuned during the first iteration, which outperformed the rules. Heart failure annotation during the second iteration initially yielded insufficient interannotator agreement (57%), necessitating disagreement analysis and correction of the annotated data. The analysis revealed that the inconsistencies arose from concept confusion among annotators. A list of ambiguous terms was identified and clarified with a medical expert, incorporated into updated annotation guidelines, and used to automatically extract sentences that needed revision from all annotations. These sentences were then reviewed and corrected by the annotators based on the updated guidelines.</p><p>Time requirements varied by medical condition and process step. We focused on the 3 most time-intensive steps: preannotation (D1), manual annotation (M1), and extractor training during cross-validation (C2). Preannotation (D1) speed ranged from 0.02 to 0.17 seconds per sentence, with COPD and diabetes being fastest (0.02 s/sentence) and smoking and family history of cancer slowest (0.12&#x2010;0.17 s/sentence). 
Manual annotation (M1) required 1.2&#x2010;2.9 seconds per sentence when considering all status labels, with diabetes and family history of cancer being the most efficient (1.2 s/sentence) and smoking being the most time-intensive (2.9 s/sentence). After removing the speed bias introduced by quickly dismissed sentences labeled as unknown, annotation speed ranged from 2.23 to 4.25 s/sentence, with hypertension being the fastest and family history of cancer the slowest, indicating that annotation speed is partly driven by sentence complexity and semantic load. Cross-validation training (C2) averaged 5&#x2010;22 min per session (of cross-validation), with COPD training being fastest (5 m/session) and hypertension slowest (22 m/session). Final extractor inference speed ranged from 0.09 to 0.12 s per sentence. Total implementation time across both iterations (Ttl) ranged from 8 hours (diabetes) to 30 hours (smoking). The time consumption and speed of all automatic steps (D1, C2, and inference of extractors) were measured on an NVIDIA A100 40 GB GPU.</p></sec><sec id="s3-3"><title>Training and Validation Datasets</title><p>The first iteration involved quickly preannotating 9997 medical documents and selecting 991 sentences from them for manual annotation. In order to improve extractors&#x2019; performance by training them with more medical status samples, the second iteration preannotated an additional 18,994 documents, yielding 11,128 sentences for manual annotation. <xref ref-type="table" rid="table2">Table 2</xref> shows the distribution of medical status in sentences annotated during each iteration. 
The training and validation dataset for the 2nd iteration of the pipeline is the combination of all sentences annotated during both iterations (12,119 sentences in total).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Number of samples of different medical status in sentences respectively annotated during the 1st (itr1) and the 2nd (itr2) iteration of the pipeline.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Medical condition</td><td align="left" valign="bottom" colspan="2">Present</td><td align="left" valign="bottom" colspan="2">Absent</td><td align="left" valign="bottom" colspan="2">Former</td><td align="left" valign="bottom" colspan="2">Unknown</td><td align="left" valign="bottom" colspan="2">Total</td></tr><tr><td align="left" valign="top"/><td align="left" valign="bottom">itr1</td><td align="left" valign="bottom">itr2</td><td align="left" valign="bottom">itr1</td><td align="left" valign="bottom">itr2</td><td align="left" valign="bottom">itr1</td><td align="left" valign="bottom">itr2</td><td align="left" valign="bottom">itr1</td><td align="left" valign="bottom">itr2</td><td align="left" valign="bottom">itr1</td><td align="left" valign="bottom">itr2</td></tr></thead><tbody><tr><td align="left" valign="top">Smoking</td><td align="left" valign="top">75</td><td align="left" valign="top">646</td><td align="left" valign="top">42</td><td align="left" valign="top">211</td><td align="left" valign="top">87</td><td align="left" valign="top">417</td><td align="left" valign="top">787</td><td align="left" valign="top">9854</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr><tr><td align="left" valign="top">Diabetes</td><td align="left" valign="top">279</td><td align="left" valign="top">257</td><td align="left" valign="top">29</td><td align="left" valign="top">37</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" 
rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">683</td><td align="left" valign="top">10,834</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr><tr><td align="left" valign="top">Hypertension</td><td align="left" valign="top">217</td><td align="left" valign="top">1184</td><td align="left" valign="top">8</td><td align="left" valign="top">30</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">766</td><td align="left" valign="top">9914</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr><tr><td align="left" valign="top">Heart failure</td><td align="left" valign="top">46</td><td align="left" valign="top">183</td><td align="left" valign="top">41</td><td align="left" valign="top">217</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">904</td><td align="left" valign="top">10,728</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr><tr><td align="left" valign="top">COPD<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">22</td><td align="left" valign="top">366</td><td align="left" valign="top">0<italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></italic></td><td align="left" valign="top">1<italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></italic></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">969</td><td align="left" valign="top">10,761</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr><tr><td align="left" valign="top">Family history of cancer</td><td align="left" valign="top">5</td><td align="left" valign="top">2</td><td align="left" valign="top">0<italic><sup><xref 
ref-type="table-fn" rid="table2fn3">c</xref></sup></italic></td><td align="left" valign="top">1<italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></italic></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">986</td><td align="left" valign="top">11,125</td><td align="left" valign="top">991</td><td align="left" valign="top">11,128</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Not applicable.</p></fn><fn id="table2fn2"><p><sup>b</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table2fn3"><p><sup>c</sup>Status with too few samples for cross-validation.</p></fn></table-wrap-foot></table-wrap><p>Sample sparsity was evident for certain status classifications. During the first iteration, most conditions had fewer than 300 samples per status, with absence samples typically fewer than presence samples. No samples were found for the absence of COPD and family history of cancer. We then increased preannotated documents for the second iteration, which successfully increased sample counts for most status classifications, though samples remained limited for the absence of COPD and for both the presence and absence of family history of cancer.</p><p>Among the 12,119 sentences, approximately 10% (n=1200) were randomly selected from the medical documents without preannotation, yielding an unbiased subset of 1200 sentences (unfiltered sentences [US]). We compared the distribution of medical statuses in this subset with that of the remaining 10,919 sentences (preannotated filtered sentences [PS]) selected through preannotation to assess the extent of bias introduced by this data qualification process (D1, D2).</p><p><xref ref-type="table" rid="table3">Table 3</xref> shows the result of this comparison. 
For each medical condition, we calculated the proportion for each of its statuses within US and PS, which is defined as:</p><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of medical status samples&#x2019; distribution (%) in 10,919 preannotated filtered sentences (PS) and 1200 unfiltered sentences (US).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Status</td><td align="left" valign="bottom" colspan="2">Smoking</td><td align="left" valign="bottom" colspan="2">Diab</td><td align="left" valign="bottom" colspan="2">Hyper</td><td align="left" valign="bottom" colspan="2">CHF<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="2">COPD<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" 
valign="bottom" colspan="2">Fam</td></tr><tr><td align="left" valign="top"/><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td><td align="left" valign="bottom">US</td><td align="left" valign="bottom">PS</td></tr></thead><tbody><tr><td align="left" valign="top">Present</td><td align="left" valign="top">44</td><td align="left" valign="top">49</td><td align="left" valign="top">100</td><td align="left" valign="top">89</td><td align="left" valign="top">100</td><td align="left" valign="top">97.3</td><td align="left" valign="top">33.3</td><td align="left" valign="top">47.1</td><td align="left" valign="top">100</td><td align="left" valign="top">99.7</td><td align="left" valign="top">0</td><td align="left" valign="top">88</td></tr><tr><td align="left" valign="top">Absent</td><td align="left" valign="top">12</td><td align="left" valign="top">17</td><td align="left" valign="top">0</td><td align="left" valign="top">11</td><td align="left" valign="top">0</td><td align="left" valign="top">2.7</td><td align="left" valign="top">66.7</td><td align="left" valign="top">52.9</td><td align="left" valign="top">0</td><td align="left" valign="top">0.3</td><td align="left" valign="top">0</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">Former</td><td align="left" valign="top">44</td><td align="left" valign="top">34</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" 
valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Prevalence of relevant statuses</td><td align="left" valign="top">0.75</td><td align="left" valign="top">13.5</td><td align="left" valign="top">0.25</td><td align="left" valign="top">5.49</td><td align="left" valign="top">0.75</td><td align="left" valign="top">13.1</td><td align="left" valign="top">0.25</td><td align="left" valign="top">4.43</td><td align="left" valign="top">0.17</td><td align="left" valign="top">3.54</td><td align="left" valign="top">0</td><td align="left" valign="top">0.07</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>CHF: congestive heart failure. </p></fn><fn id="table3fn2"><p><sup>b</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table3fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>where <italic>c</italic>() denotes the total number of samples of one or more statuses in the set. Since the purpose of the data qualification is specifically to reduce the proportion of sentences that do not express any status information (ie, the class unknown), this category was excluded from the computation to avoid inflating the proportions of relevant statuses (ie, present, absent, and former) in the PS and obscuring the assessment of distributional changes. Instead, we report the prevalence of relevant statuses in US and PS to quantify how effectively steps D1+D2 have increased relevant samples&#x2019; density in PS by filtering out irrelevant sentences (class unknown). 
This measurement is defined as:</p><disp-formula id="E6"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtable columnalign="" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>v</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mo>,</mml:mo><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>k</mml:mi><mml:mi>n</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Across most medical conditions, the distribution of status labels remained stable after the data filtering steps. 
The proportions of present status for smoking, diabetes, hypertension, and COPD, as well as the absent status for COPD and hypertension, showed only minor deviations between US and PS. A modest shift was observed for the former and absent statuses of smoking, whereas heart failure exhibited a more substantial change in the present status (from 33.3%, 1/3 in US to 47.1%, 228/484 in PS). Larger apparent shifts for rare categories, such as family history of cancer or the absent status of hypertension, reflect the extremely small number (or absence) of these samples in US, making baseline proportions unstable and limiting interpretability. Overall, despite shifts in certain conditions, the relative ordering of status frequencies remained consistent, suggesting preservation of the dominant patterns. Finally, the prevalence values show that the filtering steps substantially increased the density of relevant samples. For most conditions, the prevalence of relevant samples in PS was more than 10 times higher than in US (eg, from 0.75%, 9/1200 to 13.45%, 1469/10,919 for smoking, and from 0.25%, 3/1200 to 5.49%, 599/10,919 for diabetes), demonstrating the effectiveness of steps D1+D2 in removing irrelevant sentences while introducing only minor distributional bias.</p></sec><sec id="s3-4"><title>CamemBERT-Based Medical Status Extractors Evaluation</title><p>As a reminder, each extractor was trained in a multiclass classification setting to predict the status of a single medical condition. Evaluation metrics include precision, recall, specificity, and <italic>F</italic>-score for each individual status, as well as balanced accuracy and macro <italic>F</italic>-score aggregated across all statuses. <xref ref-type="table" rid="table4">Table 4</xref> presents the performance metrics for the best-performing extractor of each medical condition after the second iteration. 
Extractors for diabetes and heart failure achieved excellent overall performance (macro <italic>F</italic>-scores of 99% and 96%, respectively). The COPD extractor achieved a macro <italic>F</italic>-score of 99%, although evaluation was limited to the present and unknown classes due to the lack of absent samples. The diabetes, hypertension, and COPD extractors reached at least 98% <italic>F</italic>-score for presence detection. The smoking extractor achieved balanced performance across status classifications (93%&#x2010;94% <italic>F</italic>-scores), while the hypertension extractor showed imbalanced performance between presence (98% <italic>F</italic>-score) and absence (86% <italic>F</italic>-score). The extractor for family history of cancer underperformed with only 80% <italic>F</italic>-score for presence detection.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of the best CamemBERT-large based extractors obtained after the cross-validation of the 2nd iteration of the pipeline. For each of the studied 6 medical conditions&#x2014;smoking, diabetes (diab), hypertension (hyper), congestive heart failure (CHF), chronic obstructive pulmonary disease (COPD), and family history of cancer (fam)&#x2014;the extractor is evaluated on each of its statuses (unknown, absent, present, or former) with 4 metrics: <italic>F</italic>-score, precision, recall, and specificity. 
The macro <italic>F</italic>-score and balanced accuracy are calculated on (and only on) the evaluated status.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Extractor</td><td align="left" valign="bottom">Smoking</td><td align="left" valign="bottom">Diab</td><td align="left" valign="bottom">Hyper</td><td align="left" valign="bottom">CHF</td><td align="left" valign="bottom">COPD</td><td align="left" valign="bottom">Fam</td></tr></thead><tbody><tr><td align="left" valign="top">Unknown <italic>F</italic>-score</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td></tr><tr><td align="left" valign="top">Unknown Precision</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.96</td><td align="left" valign="top">1.0</td><td align="left" valign="top">1.0</td></tr><tr><td align="left" valign="top">Unknown Recall</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td></tr><tr><td align="left" valign="top">Unknown Specificity</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.94</td><td align="left" valign="top">1.0</td><td align="left" valign="top">1.0</td></tr><tr><td align="left" valign="top">Absent <italic>F</italic>-score</td><td align="left" valign="top">0.94<bold><sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></bold></td><td align="left" valign="top">1.0<bold><sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></bold></td><td 
align="left" valign="top">0.86<italic><sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></italic></td><td align="left" valign="top">0.97<bold><sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></bold></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Absent Precision</td><td align="left" valign="top">0.94</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.98</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Absent Recall</td><td align="left" valign="top">0.93</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.96</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Absent Specificity</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Present <italic>F</italic>-score</td><td align="left" valign="top">0.93<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.99<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.98<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.92<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.99<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.80<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">Present 
Precision</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top">Present Recall</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.89</td><td align="left" valign="top">1.0</td><td align="left" valign="top">1.0</td></tr><tr><td align="left" valign="top">Present Specificity</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td></tr><tr><td align="left" valign="top">Former <italic>F</italic>-score</td><td align="left" valign="top">0.94<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Former Precision</td><td align="left" valign="top">0.93</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Former Recall</td><td align="left" valign="top">0.94</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Former Specificity</td><td align="left" valign="top">0.99</td><td align="left" valign="top">&#x2014;</td><td 
align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Macro <italic>F</italic>-score</td><td align="left" valign="top">0.95<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.99<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.94<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.96<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.99<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.89<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">Balanced accuracy</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Status (other than Unknown) whose <italic>F</italic>-score has surpassed 90%.</p></fn><fn id="table4fn2"><p><sup>b</sup><italic>F</italic>-score has not surpassed 90%.</p></fn><fn id="table4fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table5">Table 5</xref> presents the results of the 3-fold cross-validation from both iterations. For each medical condition, we evaluated the stability of the extractors by reporting the mean and SD of the <italic>F</italic>-score for each status category, as well as for the overall macro <italic>F</italic>-score, across the cross-validation folds. All extractors demonstrated improved performance after the second iteration. 
After the first iteration, only the diabetes extractor achieved a macro <italic>F</italic>-score exceeding 90%, while by the second iteration, most condition extractors surpassed this threshold. Hypertension, heart failure, and COPD extractors showed the most substantial improvement (&#x003E;30%).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Result (average <italic>F</italic>-score, SD) of 3-fold stratified cross-validation with CamemBERT-large based extractors trained during the 1st (iter 1) and the 2nd (iter 2) iterations of the pipeline.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Medical condition</td><td align="left" valign="bottom">Present <italic>F</italic>-score, mean (SD)</td><td align="left" valign="bottom">Absent <italic>F</italic>-score, mean (SD)</td><td align="left" valign="bottom">Former <italic>F</italic>-score, mean (SD)</td><td align="left" valign="bottom">Unknown <italic>F</italic>-score, mean (SD)</td><td align="left" valign="bottom">Macro <italic>F</italic>-score, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">Iter 1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Smoking</td><td align="left" valign="top">0.78 (0.02)</td><td align="left" valign="top">0.70 (0.03)</td><td align="left" valign="top">0.88 (0.005)</td><td align="left" valign="top">0.99 (0.02)</td><td align="left" valign="top">0.83 (0.02)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diabetes</td><td align="left" valign="top">0.89 (0.04)</td><td align="left" valign="top">0.99 (0.01)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">0.98 (0.004)</td><td 
align="left" valign="top">0.95 (0.03)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hypertension</td><td align="left" valign="top">0.94 (0.002)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.97 (0.003)</td><td align="left" valign="top">0.63 (0.003)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Heart failure</td><td align="left" valign="top">0.24 (0.03)</td><td align="left" valign="top">0.14 (0.1)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.71 (0.09)</td><td align="left" valign="top">0.36 (0.05)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>COPD<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.93 (0.04)</td><td align="left" valign="top">0.31 (0.01)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Family history of cancer<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.99 (0.003)</td><td align="left" valign="top">0.33 (0.001)</td></tr><tr><td align="left" valign="top" colspan="6">Iter 2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Smoking</td><td align="left" valign="top">0.93 (0.008)</td><td align="left" valign="top">0.93 (0.005)</td><td align="left" 
valign="top">0.92 (0.01)</td><td align="left" valign="top">0.98 (0.0004)</td><td align="left" valign="top">0.94 (0.003)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diabetes</td><td align="left" valign="top">0.98 (0.008)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">0.98 (0.02)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.98 (0.004)</td><td align="left" valign="top">0.98 (0.01)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hypertension</td><td align="left" valign="top">0.98 (0.002)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">0.85 (0.01)<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.98 (0.002)</td><td align="left" valign="top">0.96 (0.009)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Heart failure</td><td align="left" valign="top">0.90 (0.02)</td><td align="left" valign="top">0.95 (0.02)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.97 (0.003)</td><td align="left" valign="top">0.94 (0.003)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>COPD</td><td align="left" valign="top">0.99 (0.002)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td 
align="left" valign="top">0.99 (0.003)</td><td align="left" valign="top">0.99 (0.002)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Family history of cancer<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">0.55 (0.37)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.99 (0.003)</td><td align="left" valign="top">0.73 (0.17)</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Average <italic>F</italic>-score more than 95% at detecting medical status other than Unknown (either iteration). </p></fn><fn id="table5fn2"><p><sup>b</sup>Not applicable.</p></fn><fn id="table5fn3"><p><sup>c</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table5fn4"><p><sup>d</sup>For family history of cancer, with only 7 positive samples across 2 iterations, calculating a SD of 37% for the <italic>F</italic>-score is statistically unstable and potentially misleading regarding the extractor&#x2019;s reliability.</p></fn><fn id="table5fn5"><p><sup>e</sup>Average <italic>F</italic>-score less than 90% after the second iteration.</p></fn></table-wrap-foot></table-wrap><p>Despite these improvements, the family history of cancer extractor achieved only 73% macro <italic>F</italic>-score with 17% instability, primarily due to poor performance in detecting presence (55% <italic>F</italic>-score, 37% instability). The hypertension extractor also showed suboptimal performance for absence detection (85% <italic>F</italic>-score, 1% instability). Due to insufficient samples, absence classification for COPD and family history of cancer could not be trained or evaluated.</p><p>To go one step deeper into the assessment of the extractor performances, a post hoc analysis according to document type was conducted. 
<xref ref-type="table" rid="table6">Table 6</xref> reports the average <italic>F</italic>-scores with SD, obtained from the 3-fold cross-validation in the last iteration of the pipeline, for each medical status across the major clinical document types represented in the dataset.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Average <italic>F</italic>-scores (SD) per medical status (present: pr, absent: ab, former: fm, and unknown: un) across clinical document types: Nursing care&#x2013;Procedures (Nurs-Proc), Medical reports (Med report), Nursing care&#x2013;Monitoring and vitals (Nurs-Monit), Multidisciplinary follow-up and daily notes (Daily note), Imaging reports (Image), Prescriptions and medication orders (Presc), Paramedical care (Para-med), Administrative and admission (Admin), Results and pathology reports (Patho), and Other.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Condition and status</td><td align="left" valign="bottom">Nurs-Proc</td><td align="left" valign="bottom">Med report</td><td align="left" valign="bottom">Nurs-Monit</td><td align="left" valign="bottom">Daily note</td><td align="left" valign="bottom">Image</td><td align="left" valign="bottom">Presc</td><td align="left" valign="bottom">Para-med</td><td align="left" valign="bottom">Admin</td><td align="left" valign="bottom">Patho</td><td align="left" valign="bottom">Other</td></tr></thead><tbody><tr><td align="left" valign="top">Smoking, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" 
valign="top">1.0 (N/A<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>)</td><td align="left" valign="top">0.95 (0.006)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="left" valign="top">0.93 (0.02)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.83 (0.236)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.96 (0.064)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ab</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.92 (0.01)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>fm</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.94 (0.01)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.89 (0.1)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.98 (0.004)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.93 
(0.016)</td><td align="left" valign="top">0.97 (0.029)</td><td align="left" valign="top">0.96 (0.064)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top">Diab, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.97 (0.012)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.97 (0.044)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ab</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.97 (0.02)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.99 
(0.0)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.93 (0.115)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top">Hyper, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.99 (0.002)</td><td align="left" valign="top">0 (N/A)</td><td align="left" valign="top">0.97 (0.011)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ab</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.84 (0.022)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.98 (0.004)</td><td align="left" valign="top">0.99 (0.008)</td><td align="left" valign="top">0.94 (0.037)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top">CHF<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup>, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.92 (0.029)</td><td align="left" valign="top">0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ab</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.98 (0.02)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.8 (0)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" 
valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.98 (0.002)</td><td align="left" valign="top">0.99 (0.009)</td><td align="left" valign="top">0.95 (0.014)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top">COPD<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup>, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.99 (0.002)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.89 (0.192)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.99 (0.001)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 
(N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.94 (0.079)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td></tr><tr><td align="left" valign="top">Fam, mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>pr</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.55 (0.37)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>un</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">0.99 (0.003)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td><td align="left" valign="top">1.0 (N/A)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>N/A: not applicable.</p></fn><fn id="table6fn2"><p><sup>b</sup>Not available.</p></fn><fn id="table6fn3"><p><sup>c</sup>CHF: congestive heart 
failure.</p></fn><fn id="table6fn4"><p><sup>d</sup>COPD: chronic obstructive pulmonary disease.</p></fn></table-wrap-foot></table-wrap><p>For 72 status&#x2013;document-type combinations, no sample was retrieved in the validation sets, and performance cannot be discussed (marked with &#x201C;&#x2014;&#x201D; in the table). For the remaining 115 status&#x2013;document-type combinations, we can divide results into 2 categories.</p><p>The first category comprises combinations with comparable performance (less than 5% decrease) or improvement on specific documents (109/115, 94.8%), and the second comprises those where a decrease is observed (6/115, 5.2%). In the first category, 79 instances show a score of 1.0 (N/A), indicating that only one fold contained content for evaluation. This perfect performance also suggests that few samples were available for evaluation. In the second category, 2 instances were only evaluated in one fold, and extractors missed the status (ie, for hypertension presence and heart failure presence in nursing care&#x2013;monitoring and vitals documents). For the 4 others (smoking presence and diabetes unknown status in paramedical reports, absence of heart failure in daily notes, and presence of COPD in administrative documents), a decrease is observed, but performance remains acceptable (&#x003E;80%). Examination of false positives and false negatives in these 4 cases suggests that most errors stem from the misleading narrative style of clinical notes. For example, in a paramedical report, &#x201C;OH chronique sevr&#x00E9; tabac syndrome d&#x00E9;pressif&#x201D; (Chronic alcohol use weaned tobacco use depressive disorder) was misclassified as former smoking because the boundary between alcohol withdrawal and smoking was unclear. 
Another sentence, &#x201C;des atcd cardio,HTA,DT2non trait&#x00E9;&#x201D; (a history of cardiac disease, hypertension, and untreated type 2 diabetes), was misclassified as absence of diabetes because the negation &#x201C;non&#x201D; was read as modifying the diabetes mention &#x201C;DT2&#x201D; rather than the treatment &#x201C;trait&#x00E9;.&#x201D; The extractor also produces some persistent errors due to the medical complexity of the content, which is challenging even for specialist reviewers; for instance, a sentence in a daily note was misclassified as absence of heart failure because it contains the expression &#x201C;avec une fraction d'&#x00E9;jection ventriculaire conserv&#x00E9;e&#x201D; (with a preserved ventricular ejection fraction), which does not entirely exclude the possibility of heart failure. In conclusion, although the representation of entities is very sparse, we observe overall good robustness of the extractors across the types of documents encountered. To go further, the number of annotated sentences would need to be increased.</p></sec><sec id="s3-5"><title>Comparison With Alternative Approaches</title><p><xref ref-type="fig" rid="figure3">Figure 3</xref> presents a comparison of the extractors developed using CamemBERT fine-tuning, rule-based methods, and LLM prompting (Mixtral-8&#x00D7;7B-v0.1). As a reminder, all extractors were evaluated on the same validation sets used for the 3-fold cross-validation of CamemBERT-based extractors. For conditions with ample annotated examples, such as smoking and heart failure, CamemBERT-based extractors achieved the highest performance (also confirmed by paired <italic>t</italic> test analysis; <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). In contrast, for conditions with limited training data, rule-based extractors performed comparably to or better than CamemBERT-based models, reflecting their robustness in low-data scenarios. 
This was particularly evident for family history of cancer, where CamemBERT-based extractors achieved an average macro <italic>F</italic>-score of 0.73 with a cross-validation SD of 17%, while the rule-based approach reached an average macro <italic>F</italic>-score of 0.94 with a substantially lower SD of 7.9%.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison of the performance of extractors based on 3 different approaches (fine-tuned CamemBERT, rule-based extraction, and large language model prompt query) for (<bold>A</bold>) smoking, (<bold>B</bold>) diabetes, (<bold>C</bold>) hypertension, (<bold>D</bold>) heart failure, (<bold>E</bold>) chronic obstructive pulmonary disease, and (<bold>F</bold>) family history of cancer. The white number on each bar marks the average macro <italic>F</italic>-score percentage, and the number above each bar represents the SD (also as a percentage) calculated from cross-validation. The performance is evaluated by testing all extractors on the same validation datasets used during the 3-fold cross-validation of the 2nd iteration of the pipeline.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e77409_fig03.png"/></fig><p>Compared with CamemBERT-based and rule-based extractors, LLM-prompted extractors showed intermediate performance overall but were less efficient in our setting. 
Using Mixtral-8&#x00D7;7B-v0.1, sentence-level extraction required 12&#x2010;24 seconds per sentence and 35 GB of GPU memory, which is slower and more resource-intensive than CamemBERT (0.09&#x2010;0.12 s per sentence; 2 GB GPU).</p><p>These findings indicate that CamemBERT-based and rule-based methods are the most suitable approaches for our institutional resource constraints, and that no single extraction approach is universally optimal across all clinical scenarios.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>This section examines the advantages and limitations of the MSEP pipeline within our institutional setting and explores how its design can support local development of clinical NLP extractors.</p></sec><sec id="s4-2"><title>Technical Advantages</title><sec id="s4-2-1"><title>Efficiency Gains for Clinical Text Annotation</title><p>The MSEP pipeline addresses challenges posed by the low density of medical-status information in large institutional corpora, which can hinder the efficient local development of information extraction systems. Its preannotation step helps prioritize sentences with potentially relevant information, and reframing extraction as sentence classification allows annotators to make quicker labeling decisions. Together, these components streamline dataset construction and reduce the operational burden of annotation.</p><p>Although our annotators were unavailable for extensive medical entity annotation, preventing a direct comparison of annotation speed in our institution, prior work has shown that sentence classification generally requires less annotation time and effort than word-level NER [<xref ref-type="bibr" rid="ref57">57</xref>-<xref ref-type="bibr" rid="ref60">60</xref>]. 
This is expected, as sentence classification involves assigning a single label to an entire sentence, whereas NER requires annotators to identify multiple entities within a sentence and decide on their boundaries, increasing the complexity and time required for annotation [<xref ref-type="bibr" rid="ref61">61</xref>]. This improvement in annotation speed is also supported by an empirical comparison with a previous study on NER efficiency [<xref ref-type="bibr" rid="ref20">20</xref>], as our manual sentence classification ranged from 1.2 to 2.9 seconds per sentence (or 2.23 to 4.25 s per sentence when considering only informative sentences), significantly faster than the 7.82 to 16.48 s reported for NER.</p><p>The sentence classification-based extractors also demonstrated faster inference speed than the traditional composite extractors (NER + qualifier). In our study, extractors of rule-based NER combined with EDS-NLP qualifiers were tested for smoking and family history of cancer during the preannotation step of the first iteration of the pipeline. The composite approach (0.17 s/sentence) was notably slower than our CamemBERT-based extractor (0.09&#x2010;0.12 s/sentence) and rules-only preannotation models (0.02&#x2010;0.04 s/sentence). Importantly, this efficiency gain did not compromise performance. Compared with recent medical status extractors, our pipeline achieved comparable or superior results. Our extractors reached <italic>F</italic>-scores of 99%, 95%, 96%, and 99% for diabetes, smoking, heart failure, and COPD, respectively, matching or exceeding performance metrics reported in recent literature [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s4-2-2"><title>Hybrid, Configurable, and Locally Deployable Pipeline Architecture</title><p>The MSEP pipeline was designed as a flexible and configurable framework that enables the local deployment of hybrid extraction approaches across diverse institutional settings. 
The accompanying Python package (repository link in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) exposes all key components, such as preannotation models (D1), sentence filtering criteria (D2), cross-validation strategy (C2), and extractor choice, as configurable arguments. This modular design allows researchers to tailor each step of the workflow, including the choice of extraction approach, according to their resource constraints, characteristics of the targeted information, and institutional requirements. For instance, in our experiments, fine-tuned CamemBERT models produced satisfactory performance for most medical statuses, whereas rule-based extractors yielded more stable results for conditions with limited representation in the dataset. This means that the pipeline can be configured to favor rule-based extraction in low-example scenarios. Conversely, in settings with limited annotation resources or domain expertise, the pipeline can be configured to generate LLM prompt-based extractors, which can be readily improved by providing more detailed instructions and examples in the prompts [<xref ref-type="bibr" rid="ref62">62</xref>].</p><p>Our institutional experiments further illustrated the practical feasibility of deploying MSEP within a local clinical setting. First, the full workflow, including preannotation, manual validation, model training, and validation, was applied independently to 6 medical conditions and completed within 99 hours in total (approximately 16.5 h per condition). This was achieved with 2 annotators and a single NVIDIA A100 40 GB GPU, a setup feasible for many medical research institutions. Second, the models were trained and evaluated on a heterogeneous set of clinical documents (including hospital discharge summaries, consultation notes, and reports from multiple specialties). 
The comparable performance observed across these diverse sources indicates that the workflow can be operationalized effectively within diverse documentation practices at a single institution.</p><p>While our current evaluation remains monocentric, MSEP&#x2019;s packaging and explicit parameterization are designed to facilitate reproducible deployment in other institutions. Applying identical configurations on comparable data would be expected to yield consistent results, and extending the approach to multicentric validation represents an important next step toward confirming generalizability and cross-site reproducibility.</p></sec><sec id="s4-2-3"><title>Enhanced Monitoring and Optimization Mechanisms</title><p>The pipeline provides means for evaluating and improving each of its main processes (data qualification, manual annotation and correction, model training, and cross-validation). Preannotation (D1) allows estimation and supervision of the corpus&#x2019;s sample density and the distribution of medical statuses, which can be adjusted by sentence filtering (D2); interannotator agreement calculation (M2) and analysis of annotation disagreements (M3) not only improve the quality of annotated sentences but also reveal problems in the annotation guidelines, which can then be revised accordingly (M4); extractors&#x2019; assessment and comparison (C4) enables comprehensive model evaluation and informs decisions on the need for and direction of further improvements.</p><p>The iterative nature of our pipeline provides a means for monitoring and optimizing dataset size. By accumulating annotated data gradually and evaluating extractors with cross-validation at each iteration, we can determine when to halt manual annotation, limiting resource expenditure to reasonable levels. 
We also considered active learning as an alternative approach, where the model selects samples difficult to predict for prioritized manual annotation and updates itself with the received annotation [<xref ref-type="bibr" rid="ref63">63</xref>]. In theory, this approach can maximize annotation efficiency by prioritizing the most informative samples for model improvement. However, we ultimately rejected it because model updates were disproportionately influenced by status categories with the most samples, and uncertain sample distribution prevented effective loss function weighting. Nevertheless, active learning remains valuable when balanced datasets are available or when sample balancing can be implemented during data acquisition [<xref ref-type="bibr" rid="ref64">64</xref>].</p></sec></sec><sec id="s4-3"><title>Importance for Clinical Research</title><sec id="s4-3-1"><title>Response to Clinical Text-Mining Challenges</title><p>Clinical text mining remains challenging due to the unstructured and context-dependent nature of medical narratives. Our evaluation shows that sentence-level classification, as implemented in MSEP, effectively addresses several of these challenges in practice. The approach demonstrated high accuracy across heterogeneous document types, confirming that the model captures contextual cues, such as negation or temporality, without relying on context qualifiers. Compared with traditional NER-based systems, which require token-level annotation and post hoc context detection, the sentence classification strategy substantially reduces annotation time and complexity. 
The high <italic>F</italic>-scores obtained for conditions such as smoking and diabetes illustrate that this formulation aligns well with the way medical status is expressed, which is typically at the sentence level rather than through isolated entities.</p><p>To our knowledge, there is currently no existing pipeline in the clinical NLP literature that applies sentence classification for medical status extraction. While sentence classification has been explored in other domains, such as sentiment analysis or social media mining, its application to clinical narratives remains underdeveloped and has not yet been systematically assessed for this purpose.</p></sec><sec id="s4-3-2"><title>Easy Implementation as a Research Tool for Clinical Studies</title><p>Clinical text-mining efforts are often constrained by privacy regulations and institutional heterogeneity that prevent direct data or model sharing across sites [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref38">38</xref>]. The MSEP framework was designed to address these issues by enabling efficient local dataset construction, providing flexible configuration of extraction approaches according to institutional constraints, and supporting controlled resource expenditure through performance-monitored iteration.</p><p>Beyond initial model development, MSEP demonstrated utility as an upstream component in broader research workflows that can support more specialized downstream tasks. When applied first, MSEP extractors can identify only the documents or sentences that contain the targeted medical status, allowing researchers to exclude irrelevant material entirely and thus avoid expending effort on processing noninformative documents. 
In practice, this functionality has already been demonstrated within our institution, where a smoking-status classifier created by MSEP was applied to an independent corpus of 300 clinical reports, efficiently filtering for documents concerning patients who were smokers and achieving an <italic>F</italic>-score of 99.6% for detecting smoking-related content. Downstream tasks, such as extracting smoking duration or tobacco brand, can then be focused solely on these filtered documents, reducing workload and improving overall efficiency.</p></sec></sec><sec id="s4-4"><title>Current Limitations</title><p>Our study has several limitations worth noting. First, due to the limited availability of medical experts, we did not conduct a direct comparison between sentence classification and NER. Instead, we relied on empirical observations and general consensus to suggest that sentence classification offers greater annotation efficiency. While the improvement in annotation speed appears supported, the impact on annotation quality remains to be verified.</p><p>Second, clinical document type is a factor that can influence the interpretation of medical status occurrence in text, which was not fully addressed in our study. For instance, a mention of &#x201C;hypertension&#x201D; in a medical report, especially within the &#x201C;Past Medical History&#x201D; section, typically refers to a chronic condition. In contrast, if hypertension is noted in daily progress notes, it is more likely to reflect an acute episode under current observation or management. 
Therefore, when applying MSEP to clinical documents from a CDW, it may be necessary to incorporate a document selection step during data collection (P1), not only to include relevant document types, but also to focus on specific sections within documents, such as the &#x201C;Past Medical History&#x201D; section, when these are more likely to align with the study objectives.</p><p>Third, the construction and usage of the datasets are subject to several sources of bias. Preannotation and filtering were used to increase the density of informative samples in the dataset, but mislabeling by preannotation rules could have led to the exclusion of relevant sentences (eg, those labeled as &#x201C;unknown&#x201D;). To assess this risk, 10% of our dataset consisted of nonpreannotated sentences, which were used to evaluate potential bias introduced by the preannotation rules. While most conditions showed limited distortion, a notable shift was observed for the present status of heart failure, underscoring that preannotation can influence class distributions. A further concern is the risk of circular bias, which may be induced by using sentences initially selected by rules to evaluate those same rule-based extractors. To limit this risk, we constructed the annotated corpus by combining the preannotations from all descriptors rather than relying on a single rule-based extractor per condition, resulting in a relatively small proportion of validation sentences originating from the same rule set (eg, 3.4%&#x2010;3.6% for COPD). Evaluating extractors is a complex task in such large datasets. Ideally, most of the dataset would need to be annotated to determine actual performance. However, this task requires too much annotation time for most noninformative sentences. Our approach to selection by multiple preannotations allows us to limit this time while verifying that little bias appears to be induced (eg, confirmed using sentences without preannotation). 
In addition, although identical sentences were deduplicated before cross-validation, dataset partitioning was not stratified by Patient_ID. As a result, sentences about the same patient could appear across validation folds, introducing a potential risk of data leakage. The proportion of validation sentences with patient overlap ranged from 3% to 8%. To assess the impact of this overlap, we revalidated the models after removing these sentences from the validation sets. Performance changes were minimal (a drop of less than 1% in the macro <italic>F</italic>-score for each cross-validation fold), suggesting that patient-level overlap did not artificially inflate extractor performance. Moving forward, these extractors should be used on other datasets while keeping in mind their potential limitations.</p><p>Fourth, the evaluation of certain medical statuses was limited by their very low prevalence in the corpus. For COPD and family history of cancer, the Absent status was represented by only a single instance, preventing meaningful assessment of the extractors&#x2019; ability to distinguish absent from unknown. Consequently, both extractors effectively functioned as binary classifiers (present vs unknown). This limitation reflects typical documentation practices: the absence of a specific condition (eg, COPD) is rarely stated explicitly, as doing so would require exhaustively listing all conditions that are not present; the issue is somewhat different for family history of cancer, where the absence of this condition is generally more informative (&#x201C;no family history of cancer&#x201D;) and therefore more likely to be mentioned, although such mentions remained infrequent in our dataset.</p><p>Finally, decoder-only LLMs were not fully used due to initial infrastructure limitations. 
Our computing environment, a secure server at the University Hospital of Rennes with 112 Intel(R) Xeon(R) Gold 6258R CPU cores and limited access to an NVIDIA A100 40 GB GPU, was insufficient for efficient, prompt-based inference (12&#x2010;24 s/sentence when performing medical status extraction) or fine-tuning (requiring &#x2265;64 GB VRAM).</p></sec><sec id="s4-5"><title>Perspectives and Future Applications</title><p>Beyond its current implementation, the pipeline can be extended to broader medical information extraction tasks. Ongoing work includes detecting additional statuses relevant to bladder cancer [<xref ref-type="bibr" rid="ref3">3</xref>] and extracting finer-grained information such as smoking duration and cigarette type. By first classifying sentences containing smoking mentions, the pipeline narrows the scope for subsequent extractions, improving efficiency.</p><p>Future studies will also address current limitations. First, a direct comparison of annotation speed and quality between sentence classification and NER will provide stronger evidence for our methodological choice. Second, it is well known that medical documents contain information, structures, and sometimes vocabulary that are unique to them. For our specific study, we have demonstrated a certain robustness of extractors to these phenomena. To reinforce this finding, more instances will need to be annotated, bearing in mind that we are working in a very sparse context. Third, synthetic datasets will be explored to mitigate preannotation bias, balance category distributions, and reduce reliance on sensitive records. They may also improve extractor performance on medical conditions with scarce samples (eg, family history of cancer). 
Prior studies have investigated GANs for generating synthetic structured EHRs [<xref ref-type="bibr" rid="ref40">40</xref>]; while ensuring both utility and confidentiality remains challenging, synthetic data offer a promising path forward for clinical NLP.</p><p>Finally, the Ollama framework [<xref ref-type="bibr" rid="ref65">65</xref>] will be used for a better exploration of LLM prompt potential. This framework enables accelerated local inference, improving medical status annotation speed with Mixtral-8&#x00D7;7B-v0.1 to 0.2 seconds per sentence. Although LLM prompting has not yet reached the same level of performance as CamemBERT fine-tuning for certain conditions, further exploration of prompt design and fine-tuning strategies may be pursued as infrastructure improves. However, it is important to keep in mind the computational and ecological cost of such a strategy when rule-based and CamemBERT-based approaches can provide satisfactory results.</p><p>Together, these perspectives aim to strengthen the robustness, reproducibility, and applicability of the pipeline across diverse clinical research settings.</p></sec><sec id="s4-6"><title>Conclusions</title><p>We introduced MSEP, a modular and hybrid methodological framework for extracting medical status information from unstructured clinical text using sentence classification. In our institutional case study, the pipeline substantially reduced annotation effort: sentence-level labeling with preannotation required only 1.2&#x2010;2.9 seconds per sentence. MSEP successfully supported the implementation of 3 extraction approaches: rule-based methods, fine-tuned CamemBERT models, and LLM prompting, each suited to different data and resource scenarios. Fine-tuned CamemBERT models achieved high performance for conditions with sufficient training examples (macro <italic>F</italic>-score&#x003E;94%), whereas rule-based methods provided more stable results for sparsely represented conditions. 
Together, these findings highlight MSEP&#x2019;s value as a research tool that accelerates local dataset creation and enables flexible deployment of extraction systems. Importantly, all components of the pipeline are distributed as a ready-to-use Python package.</p></sec></sec></body><back><ack><p>The authors thank VitaDX company for their collaborative contributions to this research. The authors declare the use of generative artificial intelligence (GenAI) in the research and writing process. In accordance with the GAIDeT taxonomy (2025), GenAI tools were used under full human supervision for reformatting tasks. The GenAI tool used was ChatGPT-5. Responsibility for the content and integrity of the final manuscript rests entirely with the authors. GenAI tools are not listed as authors and do not bear responsibility for the final outcomes. This declaration is submitted under the collective responsibility of the authors.</p></ack><notes><sec><title>Funding</title><p>This work received support from VitaDX (Rennes, France), which contributed to the project as an industrial partner. The funder had no influence on the study design, analysis, interpretation, or manuscript preparation.</p></sec><sec><title>Data Availability</title><p>The data used in this work contains pseudo-identifiable information and cannot be shared directly. However, a collaboration with the Clinical Data Center of Rennes Hospital can be considered for data exploitation while adhering to legal and regulatory frameworks. Please contact the corresponding author for more information. 
The Medical Status Extraction Pipeline is available as an open-source Python package (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> contains the repository link).</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CDW</term><def><p>clinical data warehouse</p></def></def-item><def-item><term id="abb2">COPD</term><def><p>chronic obstructive pulmonary disease</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MSEP</term><def><p>Medical Status Extraction Pipeline</p></def></def-item><def-item><term id="abb5">n2c2</term><def><p>National NLP Clinical Challenges</p></def></def-item><def-item><term id="abb6">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb7">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb8">PS</term><def><p>preannotated filtered sentences</p></def></def-item><def-item><term id="abb9">US</term><def><p>unfiltered sentences</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Syed</surname><given-names>M</given-names> </name><name name-style="western"><surname>Syed</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Clinical data warehousing: a scoping review</article-title><source>J Soc Clin Data Manag</source><year>2024</year><volume>4</volume><issue>2</issue><fpage>8</fpage><pub-id pub-id-type="doi">10.47912/jscdm.320</pub-id><pub-id pub-id-type="medline">41602775</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Bazoge</surname><given-names>A</given-names> </name><name name-style="western"><surname>Morin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Daille</surname><given-names>B</given-names> </name><name name-style="western"><surname>Gourraud</surname><given-names>PA</given-names> </name></person-group><article-title>Applying natural language processing to textual data from clinical data warehouses: systematic review</article-title><source>JMIR Med Inform</source><year>2023</year><month>12</month><day>15</day><volume>11</volume><fpage>e42477</fpage><pub-id pub-id-type="doi">10.2196/42477</pub-id><pub-id pub-id-type="medline">38100200</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cabon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Brihi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fezzani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pierre-Jean</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cuggia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bouzill&#x00E9;</surname><given-names>G</given-names> </name></person-group><article-title>Combining a risk factor score designed from electronic health records with a digital cytology image scoring system to improve bladder cancer detection: proof-of-concept study</article-title><source>J Med Internet Res</source><year>2025</year><month>01</month><day>22</day><volume>27</volume><fpage>e56946</fpage><pub-id pub-id-type="doi">10.2196/56946</pub-id><pub-id pub-id-type="medline">39841985</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Pierre-Jean</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marut</surname><given-names>B</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Phenotyping of heart failure with preserved ejection faction using electronic health records and echocardiography</article-title><source>Eur Heart J Open</source><year>2024</year><month>01</month><volume>4</volume><issue>1</issue><fpage>oead133</fpage><pub-id pub-id-type="doi">10.1093/ehjopen/oead133</pub-id><pub-id pub-id-type="medline">38196848</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pardo</surname><given-names>I</given-names> </name><name name-style="western"><surname>Pierre-Jean</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bouzill&#x00E9;</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Safety of subcutaneous versus intravenous ceftriaxone administration in older patients: a retrospective study</article-title><source>J Am Geriatr Soc</source><year>2024</year><month>04</month><volume>72</volume><issue>4</issue><fpage>1060</fpage><lpage>1069</lpage><pub-id pub-id-type="doi">10.1111/jgs.18786</pub-id><pub-id pub-id-type="medline">38348519</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Family history extraction from synthetic clinical narratives using natural language processing: overview and 
evaluation of a challenge data set and solutions for the 2019 national NLP clinical challenges (n2c2)/open health natural language processing (OHNLP) competition</article-title><source>JMIR Med Inform</source><year>2021</year><month>01</month><day>27</day><volume>9</volume><issue>1</issue><fpage>e24008</fpage><pub-id pub-id-type="doi">10.2196/24008</pub-id><pub-id pub-id-type="medline">33502329</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weeks</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Beck</surname><given-names>C</given-names> </name><name name-style="western"><surname>McNeer</surname><given-names>E</given-names> </name><etal/></person-group><article-title>medExtractR: a targeted, customizable approach to medication extraction from electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>03</month><day>1</day><volume>27</volume><issue>3</issue><fpage>407</fpage><lpage>418</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz207</pub-id><pub-id pub-id-type="medline">31943012</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lependu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Iyer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Udell</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name></person-group><article-title>Analyzing patterns of drug use in clinical notes for patient safety</article-title><source>AMIA Jt Summits Transl Sci 
Proc</source><year>2012</year><volume>2012</volume><fpage>63</fpage><lpage>70</lpage><pub-id pub-id-type="medline">22779054</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jonnalagadda</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gonzalez</surname><given-names>G</given-names> </name></person-group><article-title>Enhancing clinical concept extraction with distributional semantics</article-title><source>J Biomed Inform</source><year>2012</year><month>02</month><volume>45</volume><issue>1</issue><fpage>129</fpage><lpage>140</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.10.007</pub-id><pub-id pub-id-type="medline">22085698</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lerner</surname><given-names>I</given-names> </name><name name-style="western"><surname>Jouffroy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Burgun</surname><given-names>A</given-names> </name></person-group><article-title>Learning the grammar of drug prescription: recurrent neural network grammars for medication information extraction in clinical texts</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 24, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.11622</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hahn</surname><given-names>U</given-names> </name><name name-style="western"><surname>Oleynik</surname><given-names>M</given-names> 
</name></person-group><article-title>Medical information extraction in the age of deep learning</article-title><source>Yearb Med Inform</source><year>2020</year><month>08</month><volume>29</volume><issue>1</issue><fpage>208</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1055/s-0040-1702001</pub-id><pub-id pub-id-type="medline">32823318</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Musen</surname><given-names>MA</given-names></name><name name-style="western"><surname>Friedman</surname><given-names>C</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Kulikowski</surname><given-names>CA</given-names></name></person-group><article-title>Semantic text parsing for patient records</article-title><source>Medical Informatics: Knowledge Management and Data Mining in Biomedicine</source><year>2006</year><publisher-name>Springer</publisher-name><fpage>423</fpage><lpage>448</lpage><pub-id pub-id-type="doi">10.1007/0-387-25739-X_15</pub-id><pub-id pub-id-type="other">9780387257396</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Oxford University Press</collab></person-group><article-title>Medical status - Oxford Reference</article-title><source>Oxford Reference</source><year>2011</year><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.oxfordreference.com/display/10.1093/oi/authority.20110803100146893">https://www.oxfordreference.com/display/10.1093/oi/authority.20110803100146893</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Si</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name></person-group><article-title>Enhancing clinical concept extraction with contextual embeddings</article-title><source>J Am Med Inform Assoc</source><year>2019</year><month>11</month><day>1</day><volume>26</volume><issue>11</issue><fpage>1297</fpage><lpage>1304</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz096</pub-id><pub-id pub-id-type="medline">31265066</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bhatia</surname><given-names>P</given-names> </name><name name-style="western"><surname>Celikkaya</surname><given-names>B</given-names> </name><name name-style="western"><surname>Khalilia</surname><given-names>M</given-names> </name></person-group><article-title>Joint entity extraction and assertion detection for clinical text</article-title><conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 28 to Aug 2, 2019</conf-date><conf-loc>Florence, Italy</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>954</fpage><lpage>959</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/P19-1091">https://aclanthology.org/P19-1091</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/P19-1091</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Petit-Jean</surname><given-names>T</given-names> </name><name name-style="western"><surname>G&#x00E9;rardin</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Berthelot</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Collaborative and privacy-enhancing workflows on a clinical data warehouse: an example developing natural language processing pipelines to detect medical conditions</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1280</fpage><lpage>1290</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae069</pub-id><pub-id pub-id-type="medline">38573195</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neuraz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerner</surname><given-names>I</given-names> </name><name name-style="western"><surname>Digan</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Natural language processing for rapid response to emergent diseases: case study of calcium channel blockers and hypertension in the COVID-19 pandemic</article-title><source>J Med Internet Res</source><year>2020</year><volume>22</volume><issue>8</issue><fpage>e20773</fpage><pub-id pub-id-type="doi">10.2196/20773</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>I</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Goldwasser</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Neural natural language processing for unstructured data in electronic health records: a review</article-title><source>Comput Sci Rev</source><year>2022</year><month>11</month><volume>46</volume><fpage>100511</fpage><pub-id 
pub-id-type="doi">10.1016/j.cosrev.2022.100511</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedlakova</surname><given-names>J</given-names> </name><name name-style="western"><surname>Daniore</surname><given-names>P</given-names> </name><name name-style="western"><surname>Horn Wintsch</surname><given-names>A</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Sarmiento</surname><given-names>RF</given-names> </name></person-group><article-title>Challenges and best practices for digital unstructured data enrichment in health research: a systematic narrative review</article-title><source>PLOS Digit Health</source><year>2023</year><month>10</month><volume>2</volume><issue>10</issue><fpage>e0000347</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000347</pub-id><pub-id pub-id-type="medline">37819910</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name></person-group><article-title>Clinical text annotation - what factors are associated with the cost of time?</article-title><source>AMIA Annu Symp Proc</source><year>2018</year><volume>2018</volume><fpage>1552</fpage><lpage>1560</lpage><pub-id pub-id-type="medline">30815201</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moscato</surname><given-names>V</given-names> 
</name><name name-style="western"><surname>Postiglione</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sperl&#x00EC;</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vignali</surname><given-names>A</given-names> </name></person-group><article-title>ALDANER: active learning based data augmentation for named entity recognition</article-title><source>Knowl Based Syst</source><year>2024</year><month>12</month><volume>305</volume><fpage>112682</fpage><pub-id pub-id-type="doi">10.1016/j.knosys.2024.112682</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsuruoka</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tsujii</surname><given-names>JI</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name></person-group><article-title>Accelerating the annotation of sparse named entities by dynamic sentence selection</article-title><source>BMC Bioinformatics</source><year>2008</year><month>11</month><day>19</day><volume>9 Suppl 11</volume><issue>Suppl 11</issue><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1186/1471-2105-9-S11-S8</pub-id><pub-id pub-id-type="medline">19025694</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Settles</surname><given-names>B</given-names> </name><name name-style="western"><surname>Craven</surname><given-names>M</given-names> </name><name name-style="western"><surname>Friedland</surname><given-names>L</given-names> </name></person-group><article-title>Active learning with real annotation costs</article-title><conf-name>Proceedings of the NIPS Workshop on Cost-Sensitive Learning</conf-name><conf-date>Dec 13, 
2008</conf-date></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Long</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Deep neural network with embedding fusion for Chinese named entity recognition</article-title><source>ACM Trans Asian Low-Resour Lang Inf Process</source><year>2023</year><month>03</month><day>31</day><volume>22</volume><issue>3</issue><fpage>1</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1145/3570328</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kalyan</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Rajasekharan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sangeetha</surname><given-names>S</given-names> </name></person-group><article-title>AMMUS: a survey of transformer-based pretrained models in natural language processing</article-title><source>arXiv</source><comment>Preprint posted online in Aug 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2108.05542</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers 
for language understanding</article-title><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Muller</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ortiz Su&#x00E1;rez</surname><given-names>PJ</given-names> </name><etal/></person-group><article-title>CamemBERT: a tasty French language model</article-title><conf-name>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 5-10, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.645</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Grie&#x00DF;haber</surname><given-names>D</given-names> </name><name name-style="western"><surname>Maucher</surname><given-names>J</given-names> </name><name name-style="western"><surname>Vu</surname><given-names>NT</given-names> </name></person-group><article-title>Fine-tuning BERT for low-resource natural language understanding via active learning</article-title><conf-name>Proceedings of the 28th International Conference on Computational Linguistics</conf-name><conf-date>Dec 8-13, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/2020.coling-main.100">https://www.aclweb.org/anthology/2020.coling-main.100</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.coling-main.100</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><person-group 
person-group-type="author"><collab>OpenAI</collab></person-group><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><year>2022</year><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Mistral AI</collab></person-group><article-title>Mistral 7B: a new paradigm for open-weight language models</article-title><source>Mistral AI</source><year>2023</year><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://mistral.ai/news/announcing-mistral-7b/">https://mistral.ai/news/announcing-mistral-7b/</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><article-title>LLMs accelerate annotation for medical information extraction</article-title><conf-name>Proceedings of the 3rd Machine Learning for Health Symposium</conf-name><conf-date>Dec 10, 2023</conf-date></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><fpage>e50638</fpage><pub-id pub-id-type="doi">10.2196/50638</pub-id><pub-id pub-id-type="medline">37792434</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Posada</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Rueckert</surname><given-names>D</given-names> </name><name name-style="western"><surname>Meissen</surname><given-names>F</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>P</given-names> </name></person-group><article-title>Evaluation of language models in the medical context under resource-constrained settings</article-title><source>arXiv</source><comment>Preprint posted online on Jun 24, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.16611</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ademola</surname><given-names>A</given-names> </name><name name-style="western"><surname>George</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mapp</surname><given-names>G</given-names> </name></person-group><article-title>Addressing the interoperability of electronic health records: the technical and semantic interoperability, preserving privacy and security framework</article-title><source>ASI</source><year>2024</year><volume>7</volume><issue>6</issue><fpage>116</fpage><pub-id pub-id-type="doi">10.3390/asi7060116</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zuo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Information extraction from clinical notes: are we ready to switch to large language models?</article-title><source>arXiv</source><comment>Preprint posted online on Nov 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.10020</pub-id></nlm-citation></ref><ref 
id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghosheh</surname><given-names>GO</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>T</given-names> </name></person-group><article-title>A survey of generative adversarial networks for synthesizing structured electronic health records</article-title><source>ACM Comput Surv</source><year>2024</year><month>06</month><day>30</day><volume>56</volume><issue>6</issue><fpage>1</fpage><lpage>34</lpage><pub-id pub-id-type="doi">10.1145/3636424</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hier</surname><given-names>DB</given-names> </name><name name-style="western"><surname>Carrithers</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Do</surname><given-names>TS</given-names> </name><name name-style="western"><surname>Obafemi-Ajayi</surname><given-names>T</given-names> </name></person-group><article-title>Efficient standardization of clinical notes using large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 31, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2501.00644</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lahiri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shukla</surname><given-names>S</given-names> </name><name name-style="western"><surname>Stear</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Benchmarking transformer embedding models for biomedical terminology standardization</article-title><source>Mach 
Learn Appl</source><year>2025</year><month>09</month><volume>21</volume><fpage>100683</fpage><pub-id pub-id-type="doi">10.1016/j.mlwa.2025.100683</pub-id><pub-id pub-id-type="medline">40718094</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sui</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name></person-group><article-title>Privacy-preserving federated learning framework for multi-source electronic health records prognosis prediction</article-title><source>Sensors (Basel)</source><year>2025</year><month>09</month><volume>25</volume><issue>8</issue><fpage>2374</fpage><pub-id pub-id-type="doi">10.3390/s25082374</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fowl</surname><given-names>L</given-names> </name><name name-style="western"><surname>Geiping</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reich</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Decepticons: corrupted transformers breach privacy in federated learning for language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 29, 2022</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2201.12675</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Villena</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Bravo-Marquez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Dunstan</surname><given-names>J</given-names> </name></person-group><article-title>NLP modeling recommendations for restricted data availability in clinical settings</article-title><source>BMC Med Inform Decis Mak</source><year>2025</year><month>03</month><day>7</day><volume>25</volume><issue>1</issue><fpage>116</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-02948-2</pub-id><pub-id pub-id-type="medline">40055634</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Clinical concept extraction: a methodology review</article-title><source>J Biomed Inform</source><year>2020</year><month>09</month><volume>109</volume><fpage>103526</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103526</pub-id><pub-id pub-id-type="medline">32768446</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beaney</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alaa</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing natural language processing representations of coded disease sequences for prediction in electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>06</month><day>20</day><volume>31</volume><issue>7</issue><fpage>1451</fpage><lpage>1462</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocae091</pub-id><pub-id pub-id-type="medline">38719204</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madec</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bouzill&#x00E9;</surname><given-names>G</given-names> </name><name name-style="western"><surname>Riou</surname><given-names>C</given-names> </name><etal/></person-group><article-title>eHOP clinical data warehouse: from a prototype to the creation of an inter-regional clinical data centers network</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>1536</fpage><lpage>1537</lpage><pub-id pub-id-type="doi">10.3233/SHTI190522</pub-id><pub-id pub-id-type="medline">31438219</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karanikolas</surname><given-names>NN</given-names> </name><name name-style="western"><surname>Manga</surname><given-names>E</given-names> </name><name name-style="western"><surname>Samaridi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Stergiopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Tousidou</surname><given-names>E</given-names> </name><name name-style="western"><surname>Vassilakopoulos</surname><given-names>M</given-names> </name></person-group><article-title>Strengths and weaknesses of LLM-based and rule-based NLP technologies and their potential synergies</article-title><source>Electronics (Basel)</source><year>2025</year><month>07</month><day>31</day><volume>14</volume><issue>15</issue><fpage>3064</fpage><pub-id pub-id-type="doi">10.3390/electronics14153064</pub-id></nlm-citation></ref><ref 
id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jupin-Delevaux</surname><given-names>&#x00C9;</given-names> </name><name name-style="western"><surname>Djahnine</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talbot</surname><given-names>F</given-names> </name><etal/></person-group><article-title>BERT-based natural language processing analysis of French CT reports: application to the measurement of the positivity rate for pulmonary embolism</article-title><source>Res Diagn Interv Imaging</source><year>2023</year><month>06</month><volume>6</volume><fpage>100027</fpage><pub-id pub-id-type="doi">10.1016/j.redii.2023.100027</pub-id><pub-id pub-id-type="medline">39077547</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>MacLean</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cavallucci</surname><given-names>D</given-names> </name></person-group><article-title>Assessing fine-tuned NER models with limited data in French: automating detection of new technologies, technological domains, and startup names in renewable energy</article-title><source>MAKE</source><year>2024</year><month>08</month><day>27</day><volume>6</volume><issue>3</issue><fpage>1953</fpage><lpage>1968</lpage><pub-id pub-id-type="doi">10.3390/make6030096</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touchent</surname><given-names>R</given-names> </name><name name-style="western"><surname>Romary</surname><given-names>L</given-names> </name><name name-style="western"><surname>Clergerie</surname><given-names>E</given-names> </name></person-group><article-title>CamemBERT-bio: leveraging continual 
pre-training for cost-effective models on French biomedical data</article-title><source>arXiv</source><comment>Preprint posted online on Jun 27, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2306.15550</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Baldini</surname><given-names>I</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Natesan Ramamurthy</surname><given-names>K</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yurochkin</surname><given-names>M</given-names> </name></person-group><article-title>Your fairness may vary: pretrained language model fairness in toxic text classification</article-title><conf-name>Findings of the Association for Computational Linguistics: ACL 2022</conf-name><conf-date>May 22-27, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.findings-acl.176">https://aclanthology.org/2022.findings-acl.176</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.findings-acl.176</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hendrycks</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>E</given-names> </name><name name-style="western"><surname>Dziedzic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Krishnan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Song</surname><given-names>D</given-names> </name></person-group><article-title>Pretrained 
transformers improve out-of-distribution robustness</article-title><conf-name>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 5-10, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/2020.acl-main.244">https://www.aclweb.org/anthology/2020.acl-main.244</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.244</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Roux</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Savary</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bamford</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Mixtral of experts</article-title><source>arXiv</source><comment>Preprint posted online on Jan 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.04088</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Honnibal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montani</surname><given-names>I</given-names> </name></person-group><article-title>SpaCy 2: natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing</article-title><source>Sentometrics Research</source><year>2017</year><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://sentometrics-research.com/publication/72/">https://sentometrics-research.com/publication/72/</ext-link></comment></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Wajsburt</surname><given-names>P</given-names> </name><name name-style="western"><surname>Petit-Jean</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dura</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jean</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bey</surname><given-names>R</given-names> </name></person-group><article-title>EDS-NLP: efficient information extraction from French clinical notes</article-title><source>GitHub</source><access-date>2025-04-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://aphp.github.io/edsnlp">http://aphp.github.io/edsnlp</ext-link></comment></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="web"><article-title>Explosion</article-title><source>Prodigy</source><year>2022</year><access-date>2025-04-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://prodi.gy/">https://prodi.gy/</ext-link></comment></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fleiss</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>The equivalence of weighted kappa and the intraclass correlation coefficient as measures of reliability</article-title><source>Educ Psychol 
Meas</source><year>1973</year><month>10</month><volume>33</volume><issue>3</issue><fpage>613</fpage><lpage>619</lpage><pub-id pub-id-type="doi">10.1177/001316447303300309</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poiron</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cabon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cuggia</surname><given-names>M</given-names> </name></person-group><article-title>How trueness of clinical decision support systems based on machine learning is assessed?</article-title><source>Stud Health Technol Inform</source><year>2024</year><month>08</month><day>22</day><volume>316</volume><fpage>813</fpage><lpage>817</lpage><pub-id pub-id-type="doi">10.3233/SHTI240535</pub-id><pub-id pub-id-type="medline">39176916</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Komiya</surname><given-names>K</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Iwakura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sasaki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shinnou</surname><given-names>H</given-names> </name></person-group><article-title>Comparison of methods to annotate named entity corpora</article-title><source>ACM Trans Asian Low-Resour Lang Inf Process</source><year>2018</year><month>12</month><day>31</day><volume>17</volume><issue>4</issue><fpage>1</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1145/3218820</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Neves</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0160;eva</surname><given-names>J</given-names> </name></person-group><article-title>An extensive review of tools for manual annotation of documents</article-title><source>Brief Bioinform</source><year>2021</year><month>01</month><day>18</day><volume>22</volume><issue>1</issue><fpage>146</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1093/bib/bbz130</pub-id><pub-id pub-id-type="medline">31838514</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Galileo AI</collab></person-group><article-title>What is NER and why it&#x2019;s hard to get right</article-title><source>Galileo AI</source><access-date>2025-04-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.galileo.ai/blog/what-is-ner-and-why-it-s-hard-to-get-right">https://www.galileo.ai/blog/what-is-ner-and-why-it-s-hard-to-get-right</ext-link></comment></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Explosion AI</collab></person-group><article-title>Named entity recognition</article-title><source>Prodigy</source><access-date>2025-04-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://prodi.gy/docs/named-entity-recognition">https://prodi.gy/docs/named-entity-recognition</ext-link></comment></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herman Bernardim Andrade</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name></person-group><article-title>Is boundary annotation necessary? 
Evaluating boundary-free approaches to improve clinical named entity annotation efficiency: case study</article-title><source>JMIR Med Inform</source><year>2024</year><volume>12</volume><fpage>e59680</fpage><pub-id pub-id-type="doi">10.2196/59680</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Baldridge</surname><given-names>J</given-names> </name><name name-style="western"><surname>Osborne</surname><given-names>M</given-names> </name></person-group><article-title>Active learning and the total cost of annotation</article-title><conf-name>Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>2004</conf-date></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Aggarwal</surname><given-names>U</given-names> </name><name name-style="western"><surname>Popescu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hudelot</surname><given-names>C</given-names> 
</name></person-group><article-title>Active learning for imbalanced datasets</article-title><conf-name>2020 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name><conf-date>Mar 1-5, 2020</conf-date><pub-id pub-id-type="doi">10.1109/WACV45572.2020.9093475</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="web"><article-title>Ollama</article-title><source>Ollama</source><access-date>2025-04-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ollama.com/">https://ollama.com/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Pipeline configuration demonstration.</p><media xlink:href="medinform_v14i1e77409_app1.docx" xlink:title="DOCX File, 5577 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Annotation guidelines.</p><media xlink:href="medinform_v14i1e77409_app2.docx" xlink:title="DOCX File, 4879 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Complementary paired t-test evaluation of extractor performance across cross-validation folds.</p><media xlink:href="medinform_v14i1e77409_app3.docx" xlink:title="DOCX File, 3482 KB"/></supplementary-material></app-group></back></article>