<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e76912</article-id><article-id pub-id-type="doi">10.2196/76912</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Named Entity Recognition for Chinese Cancer Electronic Health Records&#x2014;Development and Evaluation of a Domain-Specific BERT Model: Quantitative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Junbai</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Butian</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tian</surname><given-names>Xiaohan</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zou</surname><given-names>Zhengkai</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Ruojia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Jiarui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Du</surname><given-names>Songxing</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Guo</surname><given-names>Fengying</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>School of Management, Beijing University of Chinese Medicine</institution><addr-line>No. 
11, North Third Ring Road East, Chaoyang District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Information Management, Peking University</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff3"><institution>Information Center, Dongfang Hospital</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Cheligeer</surname><given-names>Cheligeer</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Tekkali</surname><given-names>Praveen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Fengying Guo, MS, School of Management, Beijing University of Chinese Medicine, No. 11, North Third Ring Road East, Chaoyang District, Beijing, 100029, China, 86 13811833948; <email>guofy@bucm.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>14</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e76912</elocation-id><history><date date-type="received"><day>04</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>13</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Junbai Chen, Butian Zhao, Xiaohan Tian, Zhengkai Zou, Ruojia Wang, Jiarui Wu, Songxing Du, Fengying Guo. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 14.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e76912"/><abstract><sec><title>Background</title><p>The unstructured data of Chinese cancer electronic health records (EHRs) contains valuable medical expertise. Accurate medical entity recognition is crucial for building a medical-assisted decision system. Named entity recognition (NER) in cancer EHRs typically uses general models designed for English medical records. 
There is a lack of specialized handling for cancer-specific records and limited application to Chinese medical records.</p></sec><sec><title>Objective</title><p>This study aims to propose a specific NER model to enhance the recognition of medical entities in Chinese cancer EHRs.</p></sec><sec sec-type="methods"><title>Methods</title><p>Desensitized inpatient EHRs related to breast cancer were collected from a leading hospital in Beijing. Building upon the MC Bidirectional Encoder Representations from Transformers (BERT) foundation, the study incorporated a Chinese cancer corpus for further pretraining, resulting in the construction of the ChCancerBERT pretrained model. In conjunction with dilated-gated convolutional neural networks, bidirectional long short-term memory, a multihead attention mechanism, and a conditional random field, this model forms a multimodel, multilevel integrated NER approach.</p></sec><sec sec-type="results"><title>Results</title><p>This approach effectively extracts medical entity features related to symptoms, signs, tests, treatments, and time in Chinese breast cancer EHRs. The entity recognition performance of the proposed model surpasses that of the baseline model and other models compared in the experiment. The <italic>F</italic><sub>1</sub>-score reached 86.93%, precision reached 87.24%, and recall reached 86.61%. The model also performed well on the CCKS2019 dataset, attaining a precision of 87.26%, a recall of 87.27%, and an <italic>F</italic><sub>1</sub>-score of 87.26%, surpassing existing models.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The experiments demonstrate that the approach proposed in this study exhibits excellent performance in NER within breast cancer EHRs. This advancement will further contribute to clinical decision support for cancer treatment and research. In addition, the study reveals that incorporating domain-specific corpora in clinical NER tasks can further enhance the performance of BERT models in specialized domains.</p></sec></abstract><kwd-group><kwd>BERT</kwd><kwd>named entity recognition</kwd><kwd>cancer</kwd><kwd>electronic health records</kwd><kwd>deep learning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Cancer is one of the leading causes of mortality [<xref ref-type="bibr" rid="ref1">1</xref>], imposing a significant psychological burden on patients and potentially triggering mental health disorders such as anxiety and depression [<xref ref-type="bibr" rid="ref2">2</xref>]. The proliferation of electronic health records (EHRs) has provided a crucial source of demographic information, medical history, diagnostic tests, and clinical treatment data for cancer research, aiding in better diagnosis, prognosis, and treatment of the disease [<xref ref-type="bibr" rid="ref3">3</xref>]. However, the large amount of unstructured text in EHRs presents significant challenges for clinical research and analysis. Structuring this unstructured information is essential for subsequent data analysis and mining to support clinical decision-making. 
Named entity recognition (NER), a critical component of natural language processing (NLP), plays a key role in this process by identifying and classifying entities, transforming raw clinical texts into structured data and thereby enabling large-scale data mining and intelligent health care applications [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>In recent years, many studies have explored different NER methods to structure EHRs. The widespread application of pretrained models such as Bidirectional Encoder Representations from Transformers (BERT) has significantly enhanced NER performance in the medical field [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. These improvements are especially valuable in the medical domain, where precise recognition of specialized entities directly impacts the reliability of downstream analyses including cohort selection, clinical decision support, and the construction of medical knowledge bases [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. However, most of these studies focus on English texts, and the differences between English and Chinese in terms of language expression may limit the generalizability of English medical NER methods when directly applied to Chinese medical texts.</p><p>Compared with medical texts for common diseases, cancer EHRs contain richer and more complex information. These records span multiple fields, including medicine, biology, and pharmacology, and encompass specific medical terminologies such as cancer staging, treatment plans, and drug names. They also include diagnostic and treatment records like surgical notes and pathology results, with a prevalent occurrence of nested entities. In current NER tasks for the cancer domain, models such as BERT and other deep learning approaches based on general corpora are often used without fine-tuning for the numerous specialized terms and specific expressions in this field, which may result in errors or omissions. In addition, there is no publicly available dataset benchmark for the cancer domain. Therefore, NER tasks for Chinese cancer EHRs remain both challenging and necessary.</p><p>To address the aforementioned issues, this study contributes the following:</p><list list-type="order"><list-item><p>We retrained the MC-BERT model using a corpus specific to the Chinese cancer domain, resulting in the ChCancerBERT pretrained model tailored for NER tasks in the Chinese cancer field. This approach effectively enhances the semantic representation capabilities of pretrained models in a specialized domain.</p></list-item><list-item><p>In the Chinese medical domain, we built a hybrid model to capture temporal features and bidirectional semantic information. This combination enhances the model&#x2019;s ability to perform multidimensional feature modeling.</p></list-item><list-item><p>To validate the model&#x2019;s effectiveness, we created a Chinese breast cancer EHR corpus with manual annotations. We applied various models to this corpus, and the experimental results demonstrated that our proposed model outperforms existing models. 
We also applied the model proposed in this study to the CCKS2019 dataset and compared the results with those from existing studies for evaluation.</p></list-item></list></sec><sec id="s1-2"><title>Related Work</title><p>In the medical field, prior research has developed various NER methods for extracting information from unstructured data such as EHRs. <xref ref-type="table" rid="table1">Table 1</xref> summarizes related studies and methods for clinical NER in the medical field.</p><p>NER techniques can be categorized into 3 main types: dictionary and rule-based methods, machine learning&#x2013;based methods, and deep learning&#x2013;based methods. Dictionary and rule-based NER methods rely primarily on researchers analyzing textual writing rules and establishing dictionaries, then using regular expression matching to achieve entity recognition and information extraction. For instance, Najafabadipour et al [<xref ref-type="bibr" rid="ref9">9</xref>] extracted multiple cancer concepts such as tumor staging, mutation status, and patient presentation from lung cancer clinical records using regular expressions and the UMLS dictionary. Yim et al [<xref ref-type="bibr" rid="ref10">10</xref>] used rule-based extraction to identify 3 liver cancer entities: tumor size, staging level, and percentage of tumors invading the liver. These methods are straightforward and efficient but lack generalizability and require significant human effort to build dictionaries [<xref ref-type="bibr" rid="ref10">10</xref>]. Machine learning&#x2013;based methods use sequence labeling and train models such as hidden Markov models, conditional random fields (CRFs), and maximum entropy Markov models on large manually annotated feature corpora to achieve NER. For example, Savova et al [<xref ref-type="bibr" rid="ref11">11</xref>] developed the DeepPhe software, which uses a combination of rules, domain knowledge bases, and machine learning to extract cancer phenotypes from clinical records. Weegar et al [<xref ref-type="bibr" rid="ref12">12</xref>] used a CRF model to extract cervical cancer symptom information.</p><p>In recent years, deep learning methods have made significant progress in medical NER tasks, demonstrating substantial advantages in NLP compared to traditional feature-based machine learning methods. These approaches typically predict the boundaries and types of entities by labeling each word, thereby capturing deeper and more abstract features. For instance, An et al [<xref ref-type="bibr" rid="ref13">13</xref>] used a bidirectional long short-term memory (BiLSTM)-CRF model with a multihead attention (MHA) mechanism to perform Chinese clinical EHR NER. Kong et al [<xref ref-type="bibr" rid="ref14">14</xref>] combined multilayer convolutional neural networks (CNNs) with an attention mechanism to improve NER in Chinese EHRs. The emergence of pretrained models like BERT has further enhanced the performance of NER. In the medical domain, existing research shows that BERT-based models can be effectively applied to medical information extraction tasks. Li et al [<xref ref-type="bibr" rid="ref5">5</xref>] achieved excellent results using a BERT-BiLSTM-CRF model on the CCKS2018 and CCKS2019 datasets. Li et al [<xref ref-type="bibr" rid="ref15">15</xref>] proposed an A Lite Bidirectional Encoder Representations from Transformers (ALBERT)-based model with an MHA mechanism for Chinese medical NER, validated on the CCKS2019 dataset. 
Chen et al [<xref ref-type="bibr" rid="ref6">6</xref>] constructed a hybrid model combining MC-BERT, BiLSTM, CNN, MHA, and CRF to achieve NER in Chinese EHRs. Most of these studies primarily applied their deep learning models to publicly available datasets such as CCKS2017 and CCKS2019, without further testing them on specific medical departments or diseases.</p><p>Existing research indicates that pretraining on domain-specific text, as opposed to general corpora alone, can yield better performance. For the cancer field, Zhou et al [<xref ref-type="bibr" rid="ref7">7</xref>] proposed CancerBERT, a pretrained model specifically designed for the English language in the cancer domain. Currently, there is no similar model for the Chinese cancer domain. Given the differences between Chinese and English in terms of vocabulary, sentence structure, grammatical rules, and semantic expression, directly applying models trained on English corpora may lead to suboptimal results.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of related work on clinical named entity recognition methods in the medical and cancer domains.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Proposal</td><td align="left" valign="bottom">Approach</td><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Domain</td><td align="left" valign="bottom">Domain model</td><td align="left" valign="bottom">Benchmark</td><td align="left" valign="bottom">Language</td></tr></thead><tbody><tr><td align="left" valign="top">Najafabadipour et al 2018 [<xref ref-type="bibr" rid="ref9">9</xref>]</td><td align="left" valign="top">Rules</td><td align="left" valign="top">Rule-based methods</td><td align="left" valign="top">Cancer</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Spanish</td></tr><tr><td align="left" valign="top">Yim et al 2016 [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">Rules</td><td align="left" valign="top">Rule-based methods</td><td align="left" valign="top">Cancer</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">English</td></tr><tr><td align="left" valign="top">Savova et al 2017 [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">Rules and machine learning</td><td align="left" valign="top">Rule-based methods</td><td align="left" valign="top">Cancer</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">English</td></tr><tr><td align="left" valign="top">Weegar et al 2015 [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">Machine learning</td><td align="left" valign="top">CRF<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Cancer</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td><td align="left" valign="top">English</td></tr><tr><td align="left" valign="top">An et al 2022 [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">BiLSTM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>-MHA<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>-CRF</td><td align="left" valign="top">Medical</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" 
valign="top">Chinese</td></tr><tr><td align="left" valign="top">Kong et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">CNN<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">Medical</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Chinese</td></tr><tr><td align="left" valign="top">Li et al 2022 [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">ALBERT<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup>-IDCNN<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>-MHA-CRF</td><td align="left" valign="top">Medical</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Chinese</td></tr><tr><td align="left" valign="top">Li et al 2020 [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>-BiLSTM-CRF</td><td align="left" valign="top">Medical</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Chinese</td></tr><tr><td align="left" valign="top">Chen et al 2022 [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">MC-BERT-BiLSTM-CNN-MHA-CRF</td><td align="left" valign="top">Medical</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Chinese</td></tr><tr><td align="left" valign="top">Li et al 2023 [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>MC-BERT</p></list-item><list-item><p>GCN<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup>-CRF</p></list-item></list></td><td align="left" valign="top">Medical</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Chinese</td></tr><tr><td align="left" valign="top">Zhou et al 2022,2023 [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">Deep learning</td><td align="left" valign="top">CancerBERT</td><td align="left" valign="top">Cancer</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">English</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CRF: conditional random field.</p></fn><fn id="table1fn2"><p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p></fn><fn id="table1fn3"><p><sup>c</sup>MHA: multihead attention</p></fn><fn id="table1fn4"><p><sup>d</sup>CNN: convolutional neural network.</p></fn><fn id="table1fn5"><p><sup>e</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table1fn6"><p><sup>f</sup>IDCNN: iterated dilated convolutional neural network.</p></fn><fn id="table1fn7"><p><sup>g</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table1fn8"><p><sup>h</sup>GCN: graph convolutional network.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>Based on Article 32 of the <italic>Measures for 
Ethical Review of Life Sciences and Medical Research Involving Human Subjects</italic> [<xref ref-type="bibr" rid="ref17">17</xref>], research using human data or biological samples that does not cause harm to the human body and does not involve sensitive personal information or commercial interests is eligible for exemption from ethical review. This exemption is intended to reduce unnecessary burdens on researchers and to facilitate the progress of life sciences and medical research. In our study, the dataset used has undergone thorough anonymization, with all personal patient information removed. In addition, the patients involved in the clinical activities at the hospital had already signed the relevant informed consent forms, confirming their agreement to the use of their data for research purposes. Therefore, as per the national regulations, our study did not require ethical review. Permission to access the data was granted by SD, director of the Information Center of Dongfang Hospital, Beijing University of Chinese Medicine, who is responsible for managing all the hospital's data and is listed as a contributing author.</p></sec><sec id="s2-2"><title>Model Construction</title><p>The proposed model architecture in this study is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The architecture of the proposed model. For ease of viewing and understanding, the input text examples in the figure are accompanied by corresponding English translations. BiLSTM: bidirectional long short-term memory; CRF: conditional random field; DGCNN: dilated-gated convolutional neural network; LSTM: long short-term memory.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76912_fig01.png"/></fig><p>The model comprises 4 parts: the vector representation layer, feature extraction layer, feature fusion layer, and CRF layer. The Chinese cancer EHR texts are input into the model, where the input text is segmented into individual characters to form the basic sequence <inline-formula><mml:math id="ieqn1"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:math></inline-formula>, where <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the <italic>i</italic>-th character. 
Initially, in the vector representation layer, the input sequences <inline-formula><mml:math id="ieqn3"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:math></inline-formula> are represented as semantically rich character-level embedding vectors <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> using the autonomously trained BERT model tailored to the Chinese cancer domain (ChCancerBERT), forming the embedding sequence <inline-formula><mml:math id="ieqn5"><mml:mi>H</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:math></inline-formula>. Subsequently, the sequence <inline-formula><mml:math id="ieqn6"><mml:mi>H</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:math></inline-formula> is fed into the BiLSTM and dilated-gated convolutional neural network (DGCNN) models within the feature extraction layer. The feature extraction layer uses BiLSTM to extract long-distance dependency associations and temporal features from the sequences, while DGCNN captures spatial features. In the feature fusion layer, an MHA mechanism with enhanced generalization capability dynamically fuses the features extracted by BiLSTM and DGCNN to enhance the model&#x2019;s feature representation capability. The CRF layer imposes constraints on the output to obtain the optimal labeling results. Finally, the model outputs a set of label sequences <inline-formula><mml:math id="ieqn7"><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:math></inline-formula>, where each label <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> corresponds to a character in the input text. For instance, &#x201C;B-TIME&#x201D; denotes the beginning of a TIME entity, while &#x201C;I-TIME&#x201D; denotes the inside of a TIME entity, with the label sequence indicating the model&#x2019;s classification results for each character.</p>
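<p>To make the input-output contract concrete, the short sketch below shows how a note fragment maps to a character-level BIO label sequence of the kind described above; the text and the TREATMENT label are illustrative examples, not drawn from the study corpus.</p><preformat>
# Illustrative BIO tagging for a character-level Chinese NER sequence.
# Each character x_i receives exactly one label y_i: "B-" marks the first
# character of an entity, "I-" a continuation, and "O" non-entity text.
text = "2019年行乳腺癌根治术"  # "Radical mastectomy for breast cancer in 2019"
labels = [
    "B-TIME", "I-TIME", "I-TIME", "I-TIME", "I-TIME",   # 2019年
    "O",                                                # 行 (performed)
    "B-TREATMENT", "I-TREATMENT", "I-TREATMENT",        # 乳腺癌...
    "I-TREATMENT", "I-TREATMENT", "I-TREATMENT",        # ...根治术
]
assert len(text) == len(labels)  # one label per character
for ch, tag in zip(text, labels):
    print(ch, tag)
</preformat>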
</sec><sec id="s2-3"><title>Chinese Cancer Specialized BERT</title><p>To better capture semantics in cancer electronic medical records, this study trained a specialized BERT model tailored for the Chinese cancer domain (ChCancerBERT) to enhance the performance of extracting cancer-related phenotypes. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the training process of the ChCancerBERT model.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The training process of the ChCancerBERT model. BERT: Bidirectional Encoder Representations from Transformers; CNKI: China National Knowledge Infrastructure; MLM: masked language model; NSP: next sentence prediction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e76912_fig02.png"/></fig><p>Pretrained models are typically built on large-scale general corpora and, depending on specific needs, can also incorporate domain-specific vocabulary. Such language models assign a probability P(S) to a language sequence:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>S</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x220F;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The most commonly used pretrained model is the BERT model, a deep bidirectional language representation model that offers significant advantages in capturing textual semantic information and entity extraction [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Although BERT achieves good results in most domains, its generic nature may limit its ability to fully capture the specialized terminology and semantic nuances of the medical field [<xref ref-type="bibr" rid="ref15">15</xref>]. 
MC-BERT was fine-tuned from BERT using different pretraining strategies that incorporate medical-specific and related vocabulary, enabling superior performance in medical NER tasks [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>The ChCancerBERT model proposed in this study is further pretrained based on the MC-BERT model. The specific steps are as follows. First, a Chinese cancer corpus was constructed: using the keywords &#x201C;cancer&#x201D; and &#x201C;tumor,&#x201D; a search strategy retrieved relevant Chinese-language literature from the China National Knowledge Infrastructure (CNKI) database. The corpus included abstracts from the first 6000 articles, organized chronologically, resulting in a comprehensive dataset of 3,268,570 tokens. This dataset ensures that ChCancerBERT is exposed to terminology and context unique to the cancer domain, thus improving its capacity for semantic understanding. Second, the pretraining of ChCancerBERT used both masked language model (MLM) and next sentence prediction (NSP) tasks to enhance semantic representation. In the MLM task, certain characters within a sentence are randomly masked so that the model learns the semantic information of each character in context. This encourages ChCancerBERT to learn nuanced semantic relationships, particularly for specialized terminology within cancer-specific contexts. In addition, the NSP task is used to learn the association between 2 consecutive sentences, enabling the model to capture long-distance dependencies and contextual flows frequently present in medical records. This step is particularly beneficial for understanding the multisentence descriptions often found in medical documents.</p>
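<p>A minimal sketch of this continued-pretraining step, using the Hugging Face <italic>transformers</italic> library; the checkpoint path, corpus file name, and hyperparameters are placeholder assumptions, and the NSP objective is omitted here for brevity.</p><preformat>
from transformers import (BertTokenizerFast, BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)
from datasets import load_dataset

# Placeholder path to a local copy of the MC-BERT checkpoint (assumption).
base = "path/to/mc-bert"
tokenizer = BertTokenizerFast.from_pretrained(base)
model = BertForMaskedLM.from_pretrained(base)

# One CNKI abstract per line in a plain-text file (hypothetical file name).
corpus = load_dataset("text", data_files={"train": "cnki_cancer_abstracts.txt"})
tokenized = corpus["train"].map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=128),
    batched=True, remove_columns=["text"])

# Randomly mask 15% of tokens, the standard BERT MLM setting.
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="chcancerbert", num_train_epochs=3),
    train_dataset=tokenized,
    data_collator=collator)
trainer.train()
</preformat>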
</sec><sec id="s2-4"><title>BiLSTM Model</title><p>Long short-term memory (LSTM) is a specialized form of recurrent neural network that can capture larger and longer-distance information. By using a set of gate controllers, LSTM effectively addresses the gradient vanishing and exploding problems inherent in traditional recurrent neural networks [<xref ref-type="bibr" rid="ref21">21</xref>]. In an LSTM, the input gate <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> controls information flowing into the current cell, the forget gate <inline-formula><mml:math id="ieqn10"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> manages the forgetting of information, and the output gate <inline-formula><mml:math id="ieqn11"><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> controls the information output. <inline-formula><mml:math id="ieqn12"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> records the output at the current time step, while <inline-formula><mml:math id="ieqn13"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> preserves the current cell state and serves as input for the next cell. The sigmoid activation function is denoted by <inline-formula><mml:math id="ieqn14"><mml:mi>&#x03C3;</mml:mi></mml:math></inline-formula>, the weight matrix by <inline-formula><mml:math id="ieqn15"><mml:mi>W</mml:mi></mml:math></inline-formula>, the bias term by <inline-formula><mml:math id="ieqn16"><mml:mi>b</mml:mi></mml:math></inline-formula>, and the current cell input by <inline-formula><mml:math id="ieqn17"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. <inline-formula><mml:math id="ieqn18"><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the updated state at the current time step, and the hyperbolic tangent activation function is denoted by <inline-formula><mml:math id="ieqn19"><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">h</mml:mi></mml:math></inline-formula>:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:msub><mml:mi>i</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>o</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr>
<mml:mtr><mml:mtd><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>tanh</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>i</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>o</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:mi>tanh</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"/></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>BiLSTM consists of a forward LSTM and a backward LSTM, which respectively capture forward and backward information from the text sequence [<xref ref-type="bibr" rid="ref22">22</xref>]. These are then concatenated to obtain the final hidden layer feature representation. Compared to a single LSTM model, BiLSTM provides a more nuanced understanding of bidirectional semantic dependencies and allows for more precise semantic discrimination [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>In our model, BiLSTM is applied immediately after the vector representation layer, where ChCancerBERT generates the character-level embeddings tailored to the cancer domain. These embeddings are then fed into the BiLSTM, which consists of forward and backward LSTMs, capturing both forward and backward contextual information. By leveraging BiLSTM, the model retains bidirectional semantic dependencies that are crucial in understanding complex medical texts.</p>
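<p>A minimal PyTorch sketch of the BiLSTM feature extractor, assuming ChCancerBERT has already produced 768-dimensional character embeddings; the dimensions and class name are illustrative.</p><preformat>
import torch
import torch.nn as nn

class BiLSTMEncoder(nn.Module):
    """Bidirectional LSTM over character-level BERT embeddings."""
    def __init__(self, embed_dim=768, hidden_dim=256):
        super().__init__()
        # bidirectional=True runs forward and backward LSTMs and
        # concatenates their hidden states (output dim = 2 * hidden_dim).
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            batch_first=True, bidirectional=True)

    def forward(self, embeddings):
        # embeddings: (batch, seq_len, embed_dim) from ChCancerBERT
        outputs, _ = self.lstm(embeddings)
        return outputs  # (batch, seq_len, 2 * hidden_dim)

# Example: a batch of 2 sequences of 50 characters.
h = torch.randn(2, 50, 768)
print(BiLSTMEncoder()(h).shape)  # torch.Size([2, 50, 512])
</preformat>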
</sec><sec id="s2-5"><title>DGCNN Model</title><p>DGCNN is a neural network model with residual connections that uses gated dilated convolutions to gather more information through stacked convolutional layers [<xref ref-type="bibr" rid="ref4">4</xref>]. Its convolutional structure enables the model to analyze sequences in segments, examining local patterns that are essential to textual spatial characteristics, which usually refer to the arrangement and relationships of words and phrases that form meaningful units [<xref ref-type="bibr" rid="ref4">4</xref>]. These units include nearby dependencies and longer-range relationships, like multiword medical entities in EHRs. DGCNN&#x2019;s use of dilated convolutions allows it to capture these features at different scales or distances, enabling it to handle both close and distant relationships within the text [<xref ref-type="bibr" rid="ref14">14</xref>]. The convolution operations proceed as follows:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:munderover><mml:mo>&#x2A01;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x00B1;</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In the model, <inline-formula><mml:math id="ieqn20"><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes a filter with window size <italic>r</italic>, and <inline-formula><mml:math id="ieqn21"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the output after convolution transformation. The symbol &#x2295; indicates vector concatenation. Building on this, dilation is introduced for interval sampling of data, enabling the coverage of longer sentences. The calculation method for dilated convolution is as follows:</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:munderover><mml:mo>&#x2A01;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x00B1;</mml:mo><mml:mi>k</mml:mi><mml:mrow><mml:mi>&#x03B4;</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p><inline-formula><mml:math id="ieqn22"><mml:mi mathvariant="normal">&#x03B4;</mml:mi></mml:math></inline-formula> represents the dilation rate, and <inline-formula><mml:math id="ieqn23"><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is a filter with window size <inline-formula><mml:math id="ieqn24"><mml:mi>r</mml:mi></mml:math></inline-formula>. <inline-formula><mml:math id="ieqn25"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the output, while &#x2295; signifies vector concatenation. A gating mechanism is also incorporated to control data flow and reduce the risk of gradient vanishing. 
The operations within each convolutional gated unit are as follows:</p><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>1</mml:mn><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2297;</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>2</mml:mn><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p><inline-formula><mml:math id="ieqn26"><mml:mi>X</mml:mi></mml:math></inline-formula> represents the input. <inline-formula><mml:math id="ieqn27"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>1</mml:mn><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn28"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>2</mml:mn><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> are dilated convolutions with the same parameter structure but without sharing weights. <inline-formula><mml:math id="ieqn29"><mml:mi>&#x03C3;</mml:mi></mml:math></inline-formula> denotes the sigmoid function, and <inline-formula><mml:math id="ieqn30"><mml:mo>&#x2297;</mml:mo></mml:math></inline-formula> represents the Hadamard product of vectors.</p><p>After the ChCancerBERT model generates the character-level embeddings, these embeddings are input independently into the DGCNN model, extracting spatial dependencies and local features, which are essential for identifying nested entities and capturing complex structures within cancer-related records.</p>
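<p>The following PyTorch sketch illustrates one residual gated dilated convolution block in the spirit of equation 5; the kernel size, dilation rates, and naming are illustrative assumptions rather than the study&#x2019;s exact configuration.</p><preformat>
import torch
import torch.nn as nn

class GatedDilatedConvBlock(nn.Module):
    """Residual block: Y = X + Conv1(X) * sigmoid(Conv2(X))."""
    def __init__(self, dim=512, kernel_size=3, dilation=1):
        super().__init__()
        pad = dilation * (kernel_size - 1) // 2  # keep sequence length
        # Two convolutions with identical shapes but unshared weights.
        self.conv_value = nn.Conv1d(dim, dim, kernel_size,
                                    padding=pad, dilation=dilation)
        self.conv_gate = nn.Conv1d(dim, dim, kernel_size,
                                   padding=pad, dilation=dilation)

    def forward(self, x):
        # x: (batch, seq_len, dim); Conv1d expects (batch, dim, seq_len)
        h = x.transpose(1, 2)
        gated = self.conv_value(h) * torch.sigmoid(self.conv_gate(h))
        return x + gated.transpose(1, 2)  # residual connection

# Stack blocks with growing dilation (1, 2, 4) to widen the receptive field.
dgcnn = nn.Sequential(*[GatedDilatedConvBlock(dilation=d) for d in (1, 2, 4)])
print(dgcnn(torch.randn(2, 50, 512)).shape)  # torch.Size([2, 50, 512])
</preformat>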
</sec><sec id="s2-6"><title>MHA Mechanism</title><p>The attention mechanism simulates human attention by scoring and weighting important content to highlight significant features [<xref ref-type="bibr" rid="ref24">24</xref>]. The calculation formula is as follows:</p><disp-formula id="E6"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mi>V</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The <inline-formula><mml:math id="ieqn31"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> value represents the attention score, where <inline-formula><mml:math id="ieqn32"><mml:mi>Q</mml:mi></mml:math></inline-formula> is the query vector matrix, <inline-formula><mml:math id="ieqn33"><mml:mi>K</mml:mi></mml:math></inline-formula> is the key vector matrix, and <inline-formula><mml:math id="ieqn34"><mml:mi>V</mml:mi></mml:math></inline-formula> is the value vector matrix. <inline-formula><mml:math id="ieqn35"><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the word vector dimension, which normalizes the similarity between the query and key vectors calculated via <inline-formula><mml:math id="ieqn36"><mml:mi>Q</mml:mi><mml:msup><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> to prevent overly large results. <inline-formula><mml:math id="ieqn37"><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:math></inline-formula> normalization is then applied, followed by multiplication with the value vector to obtain the final attention value. When vector dimensions are high, a single attention calculation may not capture all features of a word. 
Thus, the MHA mechanism is used, linearly mapping <inline-formula><mml:math id="ieqn38"><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:math></inline-formula> into multiple subspaces and combining the calculated results:</p><disp-formula id="E7"><label>(7)</label><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mi>V</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>H</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:msup><mml:mi>W</mml:mi><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"/></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>This 
study uses an MHA mechanism to dynamically fuse the outputs of the BiLSTM and DGCNN models. First, the outputs of both models are mapped to separate feature spaces, generating 2 sets of <inline-formula><mml:math id="ieqn39"><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:math></inline-formula> matrices. Next, the attention values for each model are calculated, resulting in 2 attention representation matrices. Finally, these matrices are combined into a single matrix through matrix multiplication.</p>
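<p>A simplified PyTorch sketch of this fusion step, assuming equal-width BiLSTM and DGCNN outputs; using <italic>nn.MultiheadAttention</italic> and an element-wise combination here is an illustrative stand-in for the study&#x2019;s fusion layer, not its exact implementation.</p><preformat>
import torch
import torch.nn as nn

class AttentionFusion(nn.Module):
    """Fuse BiLSTM and DGCNN features with multihead attention."""
    def __init__(self, dim=512, num_heads=8):
        super().__init__()
        # One attention module per feature stream (separate Q, K, V maps).
        self.attn_lstm = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.attn_conv = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, lstm_feats, conv_feats):
        # Self-attention over each stream yields 2 representation matrices.
        a1, _ = self.attn_lstm(lstm_feats, lstm_feats, lstm_feats)
        a2, _ = self.attn_conv(conv_feats, conv_feats, conv_feats)
        # Combine the two matrices multiplicatively (element-wise here).
        return a1 * a2  # (batch, seq_len, dim)

fused = AttentionFusion()(torch.randn(2, 50, 512), torch.randn(2, 50, 512))
print(fused.shape)  # torch.Size([2, 50, 512])
</preformat>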
<inline-formula><mml:math id="ieqn43"><mml:mi mathvariant="normal"> </mml:mi><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the likelihood function for predicting the label sequence <inline-formula><mml:math id="ieqn44"><mml:mi>Y</mml:mi></mml:math></inline-formula> is as follows:</p><disp-formula id="E9"><label>(9)</label><mml:math id="eqn9"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>ln</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>Y</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">T</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>ln</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:msup><mml:mi>Y</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mi>T</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>Y</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2032;</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>By maximizing the likelihood function, the most probable and reasonable label sequence is determined and output. The specific formula is as follows:</p><disp-formula id="E10"><label>(10)</label><mml:math id="eqn10"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msup><mml:mi>Y</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo>&#x223C;</mml:mo></mml:mover><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>s</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mover><mml:mi>y</mml:mi><mml:mo>&#x223C;</mml:mo></mml:mover></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-8"><title>Experiment Setups</title><sec id="s2-8-1"><title>Dataset Construction</title><p>The study&#x2019;s data were sourced from the inpatient EHRs at a renowned hospital in Beijing, China, spanning from 2013 to 2022. After excluding patients with unrelated admission reasons (ie, those who have previously had breast cancer but whose current reason for admission is not breast cancer), the illness histories from all patients&#x2019; admission records within this time period were included. Each patient had 1 record per visit, and we ultimately obtained 876 records of patients with breast cancer (total tokens: 528,925, average length: 659.67, SD 354.23). 
</sec><sec id="s2-8"><title>Experiment Setups</title><sec id="s2-8-1"><title>Dataset Construction</title><p>The study&#x2019;s data were sourced from the inpatient EHRs of a renowned hospital in Beijing, China, spanning 2013 to 2022. After excluding patients admitted for unrelated reasons (ie, patients who previously had breast cancer but whose current reason for admission was not breast cancer), the illness histories from all patients&#x2019; admission records within this period were included. Each patient had 1 record per visit, and we ultimately obtained 876 records of patients with breast cancer (total tokens: 528,925; average length: 659.67, SD 354.23). The use of the data was fully approved by the hospital, and all procedures in this research adhered to ethical standards. Patient data were deidentified before any processing, ensuring the anonymity and confidentiality of personal information.</p></sec><sec id="s2-8-2"><title>Annotation Procedure for the Dataset</title><p>The annotation of data from the 876 admission records of patients with breast cancer was a collaborative effort involving 7 annotators. All annotators were students at the School of Management, Beijing University of Chinese Medicine, and possessed relevant medical and computer expertise. To enhance the precision of the labeling outcomes, 2 oncology clinicians from Beijing University of Chinese Medicine were enlisted to train the annotators; they also facilitated content segmentation and offered guidance throughout the labeling process. Drawing on the information extracted from the collected data, insights from pertinent clinical studies [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], categorizations and classifications in the clinical literature, indicators present in public medical databases such as SEER, CCKS2017, and CCKS2019, and existing labeling protocols [<xref ref-type="bibr" rid="ref27">27</xref>], we formulated an index system (<xref ref-type="table" rid="table2">Table 2</xref>) and labeling criteria (<xref ref-type="table" rid="table3">Table 3</xref>) for this research. These encompassed symptoms, tests, treatments, and time.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Index system of the annotated dataset for Chinese breast cancer electronic health records (EHRs).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Entity</td></tr></thead><tbody><tr><td align="left" valign="top">Symptoms</td><td align="left" valign="top">Nodule</td></tr><tr><td align="left" valign="top">Tests</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Vascular tumor thrombus and nerve aggression</p></list-item><list-item><p>Incisal edge</p></list-item><list-item><p>Lymph node dissection</p></list-item><list-item><p>Bi-RADS</p></list-item><list-item><p>TNM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></list-item><list-item><p>ER<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></p></list-item><list-item><p>PR<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></p></list-item><list-item><p>HER2<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></p></list-item><list-item><p>Ki67</p></list-item></list></td></tr><tr><td align="left" valign="top">Treatments</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Surgery name</p></list-item><list-item><p>Chemotherapy</p></list-item><list-item><p>Radiotherapy</p></list-item><list-item><p>Targeted therapy</p></list-item><list-item><p>Endocrine therapy</p></list-item></list></td></tr><tr><td align="left" valign="top">Time</td><td align="left" valign="top">Time</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>TNM: TNM classification of malignant tumors.</p></fn><fn id="table2fn2"><p><sup>b</sup>ER: estrogen receptor.</p></fn><fn id="table2fn3"><p><sup>c</sup>PR: progesterone receptor.</p></fn><fn id="table2fn4"><p><sup>d</sup>HER2: human epidermal growth factor receptor-2.</p></fn></table-wrap-foot></table-wrap>
<table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Entity labeling standards with examples<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity</td><td align="left" valign="bottom">Standard</td><td align="left" valign="bottom">Example</td><td align="left" valign="bottom">Example in English</td></tr></thead><tbody><tr><td align="left" valign="top">Time</td><td align="left" valign="top">Date and time in various formats</td><td align="left" valign="top">2011&#x5E74;</td><td align="left" valign="top">2011</td></tr><tr><td align="left" valign="top">Surgery name</td><td align="left" valign="top">Name of the surgical procedure the patient underwent</td><td align="left" valign="top">&#x5DE6;&#x4FA7;&#x4E73;&#x817A;&#x764C;&#x6839;&#x6CBB;&#x672F;</td><td align="left" valign="top">Left breast cancer radical surgery</td></tr><tr><td align="left" valign="top">Nodule</td><td align="left" valign="top">Examination results and location of breast nodules</td><td align="left" valign="top">&#x53F3;&#x4E73;&#x817A;&#x80BF;&#x7269;</td><td align="left" valign="top">Right breast mass</td></tr><tr><td align="left" valign="top">Chemotherapy</td><td align="left" valign="top">Patient chemotherapy regimen, medication, and cycle</td><td align="left" valign="top">&#x672F;&#x540E;&#x884C;&#x8F85;&#x52A9;&#x5316;&#x7597;6&#x5468;&#x671F; (&#x7D2B;&#x6749;&#x9187;+&#x73AF;&#x78F7;&#x9170;&#x80FA;&#xFF0C;&#x5177;&#x4F53;&#x5242;&#x91CF;&#x4E0D;&#x8BE6;&#xFF09;</td><td align="left" valign="top">After surgery, undergo adjuvant chemotherapy for 6 cycles (paclitaxel + cyclophosphamide, specific dosages not specified)</td></tr><tr><td align="left" valign="top">Radiotherapy</td><td align="left" valign="top">The patient&#x2019;s radiotherapy regimen and frequency</td><td align="left" valign="top">&#x653E;&#x7597;25&#x6B21;&#xFF0C;&#x5177;&#x4F53;&#x5242;&#x91CF;&#x4E0D;&#x8BE6;</td><td align="left" valign="top">Undergo 25 sessions of radiation therapy, specific dosage not specified</td></tr><tr><td align="left" valign="top">Targeted therapy</td><td align="left" valign="top">Targeted therapy regimen and drugs</td><td align="left" valign="top">&#x540E;&#x7EED;&#x8D2F;&#x8D6B;&#x8D5B;&#x6C40;&#x9776;&#x5411;&#x6CBB;&#x7597;18&#x6B21;</td><td align="left" valign="top">Followed by 18 sessions of trastuzumab targeted therapy</td></tr><tr><td align="left" valign="top">Endocrine therapy</td><td align="left" valign="top">Endocrine treatment options and drugs</td><td align="left" valign="top">&#x963F;&#x90A3;&#x66F2;&#x5511;1mgQd&#x53E3;&#x670D;&#x5185;&#x5206;&#x6CCC;&#x6CBB;&#x7597;</td><td align="left" valign="top">Anastrozole 1 mg once daily, oral administration for endocrine therapy</td></tr><tr><td align="left" valign="top">Incisal edge</td><td align="left" valign="top">Whether the patient&#x2019;s incisal edge is cancerous</td><td align="left" valign="top">&#x5404;&#x76AE;&#x80A4;&#x5207;&#x7F18;&#x672A;&#x89C1;&#x764C;&#x4FB5;&#x6DA6;</td><td align="left" valign="top">No cancer infiltration is observed at the margins of the skin</td></tr><tr><td align="left" valign="top">Vascular tumor thrombus and nerve aggression</td><td align="left" valign="top">Results of examination of vascular tumor thrombus and nerve aggression</td><td align="left" 
valign="top">&#x4FB5;&#x53CA;&#x5468;&#x56F4;&#x8102;&#x80AA;&#x7EC4;&#x7EC7;&#x53CA;&#x795E;&#x7ECF;&#x7EA4;&#x7EF4;&#x675F;&#xFF0C;&#x8109;&#x7BA1;&#x4E2D;&#x89C1;&#x764C;&#x6813;</td><td align="left" valign="top">Involvement of surrounding adipose tissue and nerve fiber bundles, with cancer emboli seen in blood vessels</td></tr><tr><td align="left" valign="top">Lymph node dissection</td><td align="left" valign="top">The patient&#x2019;s lymph node dissection results</td><td align="left" valign="top">&#x6DCB;&#x5DF4;&#x7ED3;&#x672A;&#x89C1;&#x8F6C;&#x79FB;&#x764C; (0/21&#xFF09;,&#x814B;&#x7A9D;&#x6DCB;&#x5DF4;&#x7ED3; (0/18&#xFF09;, (&#x53F3;&#x814B;&#x7A9D;&#x6DCB;&#x5DF4;&#x7ED3;&#xFF09;0/3</td><td align="left" valign="top">No metastatic cancer detected in lymph nodes (0/21), axillary lymph nodes (0/18), (right axillary lymph nodes) 0/3</td></tr><tr><td align="left" valign="top">Bi-RADS<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">Bi-RADS classification of breast nodules</td><td align="left" valign="top">BI_RADS4b&#x7C7B;<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">BI-RADS4b<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">TNM<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">The patients&#x2019; TNM stages</td><td align="left" valign="top">pT2N0M0, IIa&#x671F;</td><td align="left" valign="top">pT2N0M0,IIa</td></tr><tr><td align="left" valign="top">ER<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">Patient pathology reported ER results</td><td align="left" valign="top">ER (+++)</td><td align="left" valign="top">ER (+++)</td></tr><tr><td align="left" valign="top">PR<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">Patient pathology reported PR results</td><td align="left" valign="top">PR (+++)</td><td align="left" valign="top">PR (+++)</td></tr><tr><td align="left" valign="top">HER2<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">HER2 results were reported pathologically</td><td align="left" valign="top">HER2 (&#x2013;)</td><td align="left" valign="top">HER2 (&#x2013;)</td></tr><tr><td align="left" valign="top">Ki67</td><td align="left" valign="top">Patient pathology reported KI67 results</td><td align="left" valign="top">ki-67index&#x7EA6;20%</td><td align="left" valign="top">Ki-67 index is approximately 20%</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>All texts to be annotated are in Chinese; for ease of viewing and understanding, the annotated examples in the table are accompanied by corresponding English translations, the patient information presented is simulated.</p></fn><fn id="table3fn2"><p><sup>b</sup>Bi-RADS: breast imaging reporting and data system.</p></fn><fn id="table3fn3"><p><sup>c</sup>TNM: TNM classification of malignant tumors.</p></fn><fn id="table3fn4"><p><sup>d</sup>ER: estrogen receptor. </p></fn><fn id="table3fn5"><p><sup>e</sup>PR: progesterone receptor. </p></fn><fn id="table3fn6"><p><sup>f</sup>HER2: human epidermal growth factor receptor-2. </p></fn></table-wrap-foot></table-wrap><p>The &#x201C;symptom&#x201D; labels serve to indicate pathophysiological alterations and breast cancer symptoms, including details about breast nodules such as their location and dimensions. 
<p>The &#x201C;symptom&#x201D; labels indicate pathophysiological alterations and breast cancer symptoms, including details about breast nodules such as their location and dimensions. The &#x201C;tests&#x201D; label designates the array of tests undergone by patients with breast cancer during diagnosis and treatment, aiding physicians in precise assessments. These encompass parameters such as resection margins, vascular tumor thrombus, nerve invasion, lymph node dissection, Bi-RADS grading, TNM staging, and immunohistochemical markers (ER, PR, HER2, and Ki-67). The &#x201C;treatments&#x201D; label encompasses therapies previously administered to patients with breast cancer, comprising chemotherapy, radiation, targeted therapy, endocrine therapy, and surgery. The &#x201C;time&#x201D; label covers each temporal reference within the admission record of a patient with breast cancer.</p><p>Data annotation was performed using Label Studio software (HumanSignal). A total of 7 trained annotators, each responsible for a distinct portion of the dataset, performed the initial labeling in strict accordance with the predefined annotation guidelines. The annotated corpus was then independently reviewed and revised by 2 experienced clinical oncologists, resulting in 2 expert-annotated versions. To assess interannotator agreement between the 2 experts, the Cohen &#x03BA; coefficient was calculated, which indicated almost perfect agreement (&#x03BA;=0.82). Any discrepancies between the 2 expert versions were subsequently identified and resolved through discussion, leading to the final consensus gold-standard dataset. This finalized version was used for training and validation of the entity recognition model.</p></sec><sec id="s2-8-3"><title>Data Partitioning</title><p>For evaluation, we used the hold-out method, randomly partitioning the dataset into training, validation, and test sets at a ratio of 6:2:2. Diverging from typical NER tasks, this model primarily deals with medical nouns or statements as named entities, with a particular focus on breast cancer&#x2013;related terms. <xref ref-type="table" rid="table4">Table 4</xref> presents the count of entities corresponding to each label within the training, validation, and test sets, together with the length statistics of the entities in each category.</p>
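<p>A minimal sketch of the 6:2:2 hold-out partition is shown below; the random seed and the record representation are assumptions, as the paper does not report them.</p><preformat>
import random

def holdout_split(records, seed=42):
    # Randomly partition records into training/validation/test at 6:2:2.
    rng = random.Random(seed)
    shuffled = records[:]
    rng.shuffle(shuffled)
    n_train = int(len(shuffled) * 0.6)
    n_val = int(len(shuffled) * 0.2)
    return (shuffled[:n_train],
            shuffled[n_train:n_train + n_val],
            shuffled[n_train + n_val:])

# With the 876 records used here, this yields 525, 175, and 176 records.
</preformat>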
valign="top">15.49 (17.84)</td></tr><tr><td align="left" valign="top">Tests</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vascular tumor thrombus and nerve aggress</td><td align="char" char="." valign="top">208 (56.68)</td><td align="char" char="." valign="top">76 (20.71)</td><td align="char" char="." valign="top">83 (22.62)</td><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">42</td><td align="char" char="." valign="top">13.93 (7.79)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Incisal edge</td><td align="char" char="." valign="top">155 (58.49)</td><td align="char" char="." valign="top">53 (20)</td><td align="char" char="." valign="top">57 (21.51)</td><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">75</td><td align="char" char="." valign="top">24.15 (17.06)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lymph node dissection</td><td align="char" char="." valign="top">558 (62.21)</td><td align="char" char="." valign="top">137 (15.27)</td><td align="char" char="." valign="top">202 (22.52)</td><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">139</td><td align="char" char="." valign="top">27.92 (24.67)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Bi-RADS<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="char" char="." valign="top">161 (54.39)</td><td align="char" char="." valign="top">58 (19.59)</td><td align="char" char="." valign="top">77 (26.01)</td><td align="char" char="." valign="top">8</td><td align="char" char="." valign="top">82</td><td align="char" char="." valign="top">48.59 (22.54)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>TNM<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">55 (57.89)</td><td align="char" char="." valign="top">15 (15.79)</td><td align="char" char="." valign="top">25 (26.32)</td><td align="char" char="." valign="top">5</td><td align="char" char="." valign="top">17</td><td align="char" char="." valign="top">10.97 (3.76)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ER<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="char" char="." valign="top">438 (61.78)</td><td align="char" char="." valign="top">132 (18.62)</td><td align="char" char="." valign="top">139 (19.61)</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">45</td><td align="char" char="." valign="top">9.7 (5.52)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PR<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="char" char="." valign="top">430 (61.69)</td><td align="char" char="." valign="top">131 (18.79)</td><td align="char" char="." valign="top">136 (19.51)</td><td align="char" char="." 
valign="top">2</td><td align="char" char="." valign="top">33</td><td align="char" char="." valign="top">5.61 (4.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>HER2<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="char" char="." valign="top">393 (61.50)</td><td align="char" char="." valign="top">117 (18.31)</td><td align="char" char="." valign="top">129 (20.19)</td><td align="char" char="." valign="top">5</td><td align="char" char="." valign="top">82</td><td align="char" char="." valign="top">13.64 (10.36)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ki67</td><td align="char" char="." valign="top">420 (62.22)</td><td align="char" char="." valign="top">132 (19.56)</td><td align="char" char="." valign="top">123 (18.22)</td><td align="char" char="." valign="top">7</td><td align="char" char="." valign="top">59</td><td align="char" char="." valign="top">14.66 (7.06)</td></tr><tr><td align="left" valign="top" colspan="7">Treatments</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Surgery name</td><td align="char" char="." valign="top">985 (60.39)</td><td align="char" char="." valign="top">304 (18.64)</td><td align="char" char="." valign="top">342 (20.97)</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">74</td><td align="char" char="." valign="top">18.91 (9.06)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chemotherapy</td><td align="char" char="." valign="top">1027 (61.72)</td><td align="char" char="." valign="top">319 (19.17)</td><td align="char" char="." valign="top">318 (19.11)</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">940</td><td align="char" char="." valign="top">101.43 (107.68)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Radiotherapy</td><td align="char" char="." valign="top">177 (62.32)</td><td align="char" char="." valign="top">56 (19.72)</td><td align="char" char="." valign="top">51 (17.96)</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">141</td><td align="char" char="." valign="top">21.32 (21.42)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Targeted therapy</td><td align="char" char="." valign="top">155 (65.96)</td><td align="char" char="." valign="top">30 (12.77)</td><td align="char" char="." valign="top">50 (21.28)</td><td align="char" char="." valign="top">5</td><td align="char" char="." valign="top">352</td><td align="char" char="." valign="top">68.83 (68.16)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Endocrine therapy</td><td align="char" char="." valign="top">229 (61.89)</td><td align="char" char="." valign="top">71 (19.19)</td><td align="char" char="." valign="top">70 (18.92)</td><td align="char" char="." valign="top">5</td><td align="char" char="." valign="top">202</td><td align="char" char="." 
valign="top">32.55 (26.9)</td></tr><tr><td align="left" valign="top" colspan="7">Time</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Time</td><td align="char" char="." valign="top">2765 (59.54)</td><td align="char" char="." valign="top">892 (19.21)</td><td align="char" char="." valign="top">987 (21.25)</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">395</td><td align="char" char="." valign="top">56.41 (55.25)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Bi-RADS: breast imaging reporting and data system.</p></fn><fn id="table4fn2"><p><sup>b</sup>TNM: TNM classification of malignant tumors.</p></fn><fn id="table4fn3"><p><sup>c</sup>ER: estrogen receptor.</p></fn><fn id="table4fn4"><p><sup>d</sup>PR: progesterone receptor. </p></fn><fn id="table4fn5"><p><sup>e</sup>HER2: human epidermal growth factor receptor-2.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-8-4"><title>Evaluation Metrics</title><p>During the experiment, the evaluation of the NER model used 3 metrics: precision (P), recall (R), and <italic>F</italic><sub>1</sub><italic>-</italic>score (<italic>F</italic><sub>1</sub>). The formulas for calculating precision, recall, and <italic>F</italic><sub>1</sub>-score are as follows:</p><disp-formula id="E11"><label>(11)</label><mml:math id="eqn11"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mn>100</mml:mn><mml:mi mathvariant="normal">%</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mn>100</mml:mn><mml:mi mathvariant="normal">%</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mrow><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">R</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>R</mml:mi></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mn>100</mml:mn><mml:mi mathvariant="normal">%</mml:mi></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"/></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where TP represents the number of true positive entities, FP indicates the number of false positive entities, and 
FN denotes the number of false negative entities.</p></sec><sec id="s2-8-5"><title>Implementation Details</title><p>The experiments were conducted using the PyTorch 1.6 framework in a Python (version 3.7; Python Software Foundation) environment, on an Intel Core i5-12500H CPU and an NVIDIA T4 GPU. The ChCancerBERT model parameters were configured identically to those of the BERT model, with 12 transformer layers, 12 attention heads, and 768 hidden units. The hyperparameters for the NER experiment are detailed in <xref ref-type="table" rid="table5">Table 5</xref>.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Hyperparameter settings.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Hyperparameters</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Dropout</td><td align="left" valign="top">0.5</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">0.00003</td></tr><tr><td align="left" valign="top">Epochs</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top">Maximum length</td><td align="left" valign="top">128</td></tr><tr><td align="left" valign="top">Embedding dimension</td><td align="left" valign="top">768</td></tr></tbody></table></table-wrap></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Using the partitioned dataset, we devised comparative experiments, ablation experiments, and separate pretrained model comparative experiments to facilitate a comprehensive assessment of recognition efficacy across various mechanisms and models.</p><sec id="s3-1"><title>Results of the Comparative Experiment for NER Models</title><p>We applied our proposed model and the NER models mentioned in <xref ref-type="table" rid="table1">Table 1</xref>, which have been developed for the medical and cancer fields in recent years, to our dataset to compare performance. Detailed results can be found in <xref ref-type="table" rid="table6">Table 6</xref>. We did not include the CancerBERT model of Zhou et al [<xref ref-type="bibr" rid="ref7">7</xref>] in the comparison because it is not yet publicly available. 
Our results demonstrate that, on the Chinese breast cancer EHR dataset we constructed, our proposed model outperformed the 3 existing models, with <italic>F</italic><sub>1</sub>-scores increasing by 1.40%, 2.57%, and 1.13%, respectively.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Comparative performance of baseline and proposed models on the breast cancer dataset.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%)</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>-BiLSTM<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup>-CRF [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">85.77</td><td align="left" valign="top">85.38</td><td align="left" valign="top">85.58</td></tr><tr><td align="left" valign="top">ALBERT<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup>-IDCNN<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup>-MHA-CRF [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">87.78</td><td align="left" valign="top">81.28</td><td align="left" valign="top">84.41</td></tr><tr><td align="left" valign="top">MC-BERT-BiLSTM-CNN<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup>-MHA<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup>-CRF<sup><xref ref-type="table-fn" rid="table6fn7">g</xref></sup> [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">85.64</td><td align="left" valign="top">86.07</td><td align="left" valign="top">85.85</td></tr><tr><td align="left" valign="top">Our proposed model</td><td align="left" valign="top">87.63</td><td align="left" valign="top">86.34</td><td align="left" valign="top">86.98</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table6fn2"><p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p></fn><fn id="table6fn3"><p><sup>c</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table6fn4"><p><sup>d</sup>IDCNN: iterated dilated convolutional neural network.</p></fn><fn id="table6fn5"><p><sup>e</sup>CNN: convolutional neural network.</p></fn><fn id="table6fn6"><p><sup>f</sup>MHA: multihead attention.</p></fn><fn id="table6fn7"><p><sup>g</sup>CRF: conditional random field.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Results of the Ablation Experiment</title><p>We used the fundamental BERT-BiLSTM-CRF model as a baseline to assess the impact of distinct mechanisms on the recognition of named entities within Chinese EHRs pertaining to breast cancer. BERT-DGCNN-CRF replaces the BiLSTM component of the baseline with DGCNN, while the baseline+DGCNN model merges the BiLSTM and DGCNN models. In addition, the baseline+DGCNN+MHA model introduces an MHA mechanism on the BERT-BiLSTM-DGCNN-CRF foundation. The corresponding experimental outcomes are detailed in <xref ref-type="table" rid="table7">Table 7</xref>.</p>
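<p>The 4 ablation variants differ only in which components are enabled around the shared BERT embedding and CRF layers. The following is an illustrative summary of the grid, not the authors&#x2019; training script:</p><preformat>
# Component switches for each ablation variant.
ABLATION_VARIANTS = {
    "baseline (BERT-BiLSTM-CRF)": {"bilstm": True, "dgcnn": False, "mha": False},
    "BERT-DGCNN-CRF": {"bilstm": False, "dgcnn": True, "mha": False},
    "baseline+DGCNN": {"bilstm": True, "dgcnn": True, "mha": False},
    "baseline+DGCNN+MHA": {"bilstm": True, "dgcnn": True, "mha": True},
}

def layer_stack(bilstm, dgcnn, mha):
    # Pseudo-assembly: the ordered layers of one variant.
    layers = ["BERT embedding"]
    if bilstm:
        layers.append("BiLSTM")
    if dgcnn:
        layers.append("DGCNN")
    if mha:
        layers.append("MHA fusion")
    layers.append("CRF")
    return layers
</preformat>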
<p>Analysis of the ablation outcomes reveals that substituting the BiLSTM model with the DGCNN model reduces the precision and <italic>F</italic><sub>1</sub><italic>-</italic>score on this study&#x2019;s dataset by 0.27% and 0.14%, respectively. However, combining the DGCNN and BiLSTM models yields <italic>F</italic><sub>1</sub><italic>-</italic>score increments of 0.35% and 0.49% compared with the baseline and BERT-DGCNN-CRF models, respectively. This combination also improves precision by 0.02% and 0.29%, and recall by 0.69%, compared with the baseline and BERT-DGCNN-CRF models, respectively. This observation underscores that, for feature extraction, the standalone DGCNN model presents no notable superiority over the BiLSTM model, whereas the combined application of these 2 models augments recognition effectiveness. Building upon this foundation, incorporating the MHA enhances the extraction of pertinent information within the data, thereby mitigating the impact of irrelevant data. Consequently, the <italic>F</italic><sub>1</sub><italic>-</italic>score experiences a supplementary enhancement of 0.17%, with a precision decrease of 0.46% offset by a substantial recall increase of 0.82%. This suggests that although the addition of the MHA slightly reduces precision, it compensates by capturing a broader range of relevant information, ultimately enhancing overall recognition effectiveness. Thus, based on the ablation experiments, the optimal approach for this study incorporates the BiLSTM-DGCNN fusion model as the feature extraction layer, together with the MHA mechanism.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Results of ablation studies and pretrained model comparisons.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%)</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup>-BiLSTM<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup>-CRF<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup> (baseline)</td><td align="left" valign="top">85.77</td><td align="left" valign="top">85.38</td><td align="left" valign="top">85.58</td></tr><tr><td align="left" valign="top">BERT-DGCNN<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup>-CRF</td><td align="left" valign="top">85.50</td><td align="left" valign="top">85.38</td><td align="left" valign="top">85.44</td></tr><tr><td align="left" valign="top">BERT-DGCNN-BiLSTM-CRF</td><td align="left" valign="top">85.79</td><td align="left" valign="top">86.07</td><td align="left" valign="top">85.93</td></tr><tr><td align="left" valign="top">BERT-DGCNN-BiLSTM-MHA<sup><xref ref-type="table-fn" rid="table7fn5">e</xref></sup>-CRF</td><td align="left" valign="top">85.33</td><td align="left" valign="top">86.89</td><td align="left" valign="top">86.10</td></tr><tr><td align="left" valign="top">ALBERT<sup><xref ref-type="table-fn" rid="table7fn6">f</xref></sup>-BiLSTM-CRF</td><td align="left" valign="top">87.50</td><td align="left" valign="top">77.32</td><td 
align="left" valign="top">82.10</td></tr><tr><td align="left" valign="top">ALBERT-DGCNN-BiLSTM-MHA-CRF</td><td align="left" valign="top">86.61</td><td align="left" valign="top">84.98</td><td align="left" valign="top">85.78</td></tr><tr><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table7fn7">g</xref></sup>-BiLSTM-CRF</td><td align="left" valign="top">86.14</td><td align="left" valign="top">82.51</td><td align="left" valign="top">84.29</td></tr><tr><td align="left" valign="top">RoBERTa-DGCNN-BiLSTM-MHA-CRF</td><td align="left" valign="top">86.86</td><td align="left" valign="top">85.79</td><td align="left" valign="top">86.32</td></tr><tr><td align="left" valign="top">XLNet<sup><xref ref-type="table-fn" rid="table7fn8">h</xref></sup>-BiLSTM-CRF</td><td align="left" valign="top">86.66</td><td align="left" valign="top">78.55</td><td align="left" valign="top">82.40</td></tr><tr><td align="left" valign="top">XLNet-DGCNN-BiLSTM-MHA-CRF</td><td align="left" valign="top">87.52</td><td align="left" valign="top">85.93</td><td align="left" valign="top">86.72</td></tr><tr><td align="left" valign="top">MC-BERT-BiLSTM-CRF</td><td align="left" valign="top">85.36</td><td align="left" valign="top">85.25</td><td align="left" valign="top">85.30</td></tr><tr><td align="left" valign="top">MC-BERT-DGCNN-BiLSTM-MHA-CRF</td><td align="left" valign="top">86.98</td><td align="left" valign="top">86.48</td><td align="left" valign="top">86.73</td></tr><tr><td align="left" valign="top">ChCancerBERT-BiLSTM-CRF</td><td align="left" valign="top">85.75</td><td align="left" valign="top">86.61</td><td align="left" valign="top">86.18</td></tr><tr><td align="left" valign="top">Our proposed model</td><td align="left" valign="top">87.63</td><td align="left" valign="top">86.34</td><td align="left" valign="top">86.98</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table7fn2"><p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p></fn><fn id="table7fn3"><p><sup>c</sup>CRF: conditional random field.</p></fn><fn id="table7fn4"><p><sup>d</sup>DGCNN: dilated gated convolutional neural network.</p></fn><fn id="table7fn5"><p><sup>e</sup>MHA: multihead attention.</p></fn><fn id="table7fn6"><p><sup>f</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table7fn7"><p><sup>g</sup>RoBERTa: robustly optimized Bidirectional Encoder Representations from Transformers pretraining approach.</p></fn><fn id="table7fn8"><p><sup>h</sup>XLNet: generalized autoregressive pretraining.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Results of the Comparative Experiment for the Pretrained Models</title><p>Leveraging the model exhibiting the highest <italic>F</italic><sub>1</sub><italic>-</italic>score from the aforementioned experiments, we proceeded to perform comparative experiments by substituting various pretrained models. Comparison experiments were conducted by integrating different pretrained models with both BiLSTM+CRF and DGCNN-BiLSTM-MHA-CRF architectures. All used pretrained models are available for download from the &#x201C;Models-Hugging Face&#x201D; website. The corresponding outcomes are presented in <xref ref-type="table" rid="table7">Table 7</xref>.</p><p>The results indicate that using the proposed pretraining model, ChCancerBERT, for character-level embedding outperforms various pretrained models compared in the experiment. 
<p>Specifically, in the comparison experiments with the BiLSTM-CRF architecture, the <italic>F</italic><sub>1</sub><italic>-</italic>scores of the ChCancerBERT model increased by 0.6%, 4.08%, 1.89%, 3.78%, and 0.88% compared with BERT, ALBERT, RoBERTa, XLNet, and MC-BERT, respectively. In the comparison experiments with the DGCNN-BiLSTM-MHA-CRF architecture, it achieved <italic>F</italic><sub>1</sub><italic>-</italic>score improvements of 0.88%, 1.20%, 0.66%, 0.26%, and 0.25% over BERT, ALBERT, RoBERTa, XLNet, and MC-BERT, respectively. In addition, regardless of the pretrained model used, the <italic>F</italic><sub>1</sub><italic>-</italic>scores obtained with the DGCNN-BiLSTM-MHA-CRF architecture were consistently higher than those obtained with the BiLSTM-CRF model alone, further demonstrating that the DGCNN-BiLSTM-MHA model, by integrating multidimensional text features, possesses a stronger semantic representation capability than the BiLSTM model alone.</p><p>After using ChCancerBERT for character-level embedding and integrating it with the DGCNN-BiLSTM-MHA-CRF model to fuse multidimensional text features, the semantic representation capability is further enhanced, and the <italic>F</italic><sub>1</sub><italic>-</italic>score improved by 1.4% compared with the current mainstream BERT-BiLSTM-CRF (baseline) model. In addition, our model achieves a precision of 87.63% and a recall of 86.34%, increases of 1.86% and 0.96%, respectively, over the baseline BERT-BiLSTM-CRF model. This reflects both high detection accuracy and robustness in capturing target entities.</p></sec><sec id="s3-4"><title>Error Analysis</title><p>We conducted a comprehensive analysis of the 4 main entity categories in this study, including their quantity, average length, and precision, recall, and <italic>F</italic><sub>1</sub><italic>-</italic>scores. The specific results are presented in <xref ref-type="table" rid="table8">Table 8</xref>.</p><p>The results presented in <xref ref-type="table" rid="table8">Table 8</xref> indicate that time entities are the most frequently occurring, totaling 4644 instances. The model demonstrates the highest recognition effectiveness for these entities, achieving an <italic>F</italic><sub>1</sub><italic>-</italic>score of 92.79%. In contrast, although the treatments category also has a high number of entities, it shows the lowest recognition performance, with an <italic>F</italic><sub>1</sub><italic>-</italic>score of 74.24%. This lower performance can be attributed to the complexity and variability of treatment-related terms, compounded by the fact that the average length of treatment entities is 17.27 characters, considerably longer than that of the other entity categories. Treatment entities often contain multiple details, including various drugs, dosages, treatment methods, and frequencies, as in the example: &#x201C;Chemotherapy: Cyclophosphamide 1000 mg on day 1, Epirubicin 50 mg on day 1, 21 days per cycle, 4 cycles.&#x201D; In such cases, the model may identify only the drugs and their dosages used in chemotherapy while ignoring the subsequent frequency information.</p>
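<p>Such partial extractions are penalized twice under strict span-level matching: the truncated prediction does not match the gold boundary, so it counts as both a false positive and a false negative. The following is a minimal sketch of the entity-level computation behind equation 11, assuming entities are represented as (start, end, label) triples:</p><preformat>
def entity_prf(gold, pred):
    # gold, pred: sets of (start, end, label) triples; a prediction is a
    # true positive only if boundaries and label both match exactly.
    tp = len(gold &amp; pred)
    fp = len(pred - gold)
    fn = len(gold - pred)
    p = tp / (tp + fp) if pred else 0.0
    r = tp / (tp + fn) if gold else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

# A treatment span truncated before "21 days per cycle, 4 cycles"
# contributes one false positive and one false negative.
gold = {(10, 52, "CHEMOTHERAPY")}
pred = {(10, 34, "CHEMOTHERAPY")}
print(entity_prf(gold, pred))  # (0.0, 0.0, 0.0)
</preformat>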
<p>The increased length and complexity of these entities make it more challenging for the model to accurately identify and classify them, reducing overall recognition accuracy.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Performance obtained for each entity class.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Entities, n</td><td align="left" valign="bottom">Average length</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub><italic>-</italic>score (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Symptoms</td><td align="left" valign="top">973</td><td align="left" valign="top">7.62</td><td align="left" valign="top">82.86</td><td align="left" valign="top">84.89</td><td align="left" valign="top">83.86</td></tr><tr><td align="left" valign="top">Tests</td><td align="left" valign="top">640</td><td align="left" valign="top">7.82</td><td align="left" valign="top">82.61</td><td align="left" valign="top">81.40</td><td align="left" valign="top">81.02</td></tr><tr><td align="left" valign="top">Treatments</td><td align="left" valign="top">4184</td><td align="left" valign="top">17.27</td><td align="left" valign="top">76.90</td><td align="left" valign="top">72.30</td><td align="left" valign="top">74.24</td></tr><tr><td align="left" valign="top">Time</td><td align="left" valign="top">4644</td><td align="left" valign="top">8.01</td><td align="left" valign="top">92.62</td><td align="left" valign="top">92.96</td><td align="left" valign="top">92.79</td></tr></tbody></table></table-wrap></sec><sec id="s3-5"><title>Experiment on CCKS2019</title><p>To comprehensively validate the efficacy of the model proposed in this study, we applied it to the CCKS2019 dataset and compared its performance with models recently introduced in the literature. The CCKS2019 dataset, publicly accessible and broadly used within China&#x2019;s medical NLP research community, offers a standardized evaluation platform for diverse NER models [<xref ref-type="bibr" rid="ref28">28</xref>]. 
In total, we compared 8 models: the 3 models from the comparative experiment above (<xref ref-type="table" rid="table6">Table 6</xref>) and the following 5 models:</p><list list-type="bullet"><list-item><p>Embeddings from language models (ELMo)-lattice-LSTM-CRF model: this model uses the lattice LSTM-CRF architecture and integrates contextual character representations from a Chinese ELMo variant to enhance the use of character and word information in Chinese EHRs [<xref ref-type="bibr" rid="ref29">29</xref>].</p></list-item><list-item><p>ELMo&#x2013;encoder from transformer&#x2013;CRF model: this model fine-tunes the ELMo model using domain-specific clinical records and uses a transformer encoder (encoder from transformer) to address long-context dependencies [<xref ref-type="bibr" rid="ref30">30</xref>].</p></list-item><list-item><p>All CNN model: this model incorporates Chinese radical characters to enhance the morphosemantic representation of Chinese characters [<xref ref-type="bibr" rid="ref14">14</xref>].</p></list-item><list-item><p>Graph attention network&#x2013;BiLSTM-CRF model: this model uses a graph neural network to model lexical dependency information as syntactic semantics and integrates semantic information at various levels to enhance contextual representation [<xref ref-type="bibr" rid="ref31">31</xref>].</p></list-item><list-item><p>Chinese medical&#x2013;NER model: this model enhances the MC-BERT model by incorporating graph convolutional networks and CRFs [<xref ref-type="bibr" rid="ref16">16</xref>].</p></list-item></list><p><xref ref-type="table" rid="table9">Table 9</xref> compares the performance of our proposed model on the CCKS2019 dataset with that of models introduced in recent literature. Our model achieves an <italic>F</italic><sub>1</sub><italic>-</italic>score of 87.26%, surpassing the compared models by margins of 2.64%, 2.24%, 2.13%, 1.67%, 1.63%, 1.34%, 0.99%, and 0.97%. These results strongly affirm the effectiveness of our model. 
The experimental findings demonstrate that our cancer-specific NER model outperforms existing models on the CCKS2019 dataset, highlighting its potential for effectively handling larger medical datasets.</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Performance comparison of the proposed model and recent models on the CCKS2019 dataset.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub><italic>-</italic>score (%)</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup>-BiLSTM<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup>-CRF<sup><xref ref-type="table-fn" rid="table9fn3">c</xref></sup> [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">82.09</td><td align="left" valign="top">87.32</td><td align="left" valign="top">84.62</td></tr><tr><td align="left" valign="top">ELMo<sup><xref ref-type="table-fn" rid="table9fn4">d</xref></sup>-lattice-LSTM<sup><xref ref-type="table-fn" rid="table9fn5">e</xref></sup>-CRF [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">85.35</td><td align="left" valign="top">84.69</td><td align="left" valign="top">85.02</td></tr><tr><td align="left" valign="top">ACNN<sup><xref ref-type="table-fn" rid="table9fn6">f</xref></sup> [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">87.29</td><td align="left" valign="top">83.07</td><td align="left" valign="top">85.13</td></tr><tr><td align="left" valign="top">ELMo-ET<sup><xref ref-type="table-fn" rid="table9fn7">g</xref></sup>-CRF model [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">87.61</td><td align="left" valign="top">83.65</td><td align="left" valign="top">85.59</td></tr><tr><td align="left" valign="top">ALBERT<sup><xref ref-type="table-fn" rid="table9fn8">h</xref></sup>-IDCNN<sup><xref ref-type="table-fn" rid="table9fn9">i</xref></sup>-MHA<sup><xref ref-type="table-fn" rid="table9fn10">j</xref></sup>-CRF [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">84.82</td><td align="left" valign="top">86.46</td><td align="left" valign="top">85.63</td></tr><tr><td align="left" valign="top">GAT<sup><xref ref-type="table-fn" rid="table9fn11">k</xref></sup>-BiLSTM-CRF [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">86.74</td><td align="left" valign="top">85.11</td><td align="left" valign="top">85.92</td></tr><tr><td align="left" valign="top">MC-BERT-BiLSTM-CNN<sup><xref ref-type="table-fn" rid="table9fn12">l</xref></sup>-MHA-CRF [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">84.90</td><td align="left" valign="top">87.67</td><td align="left" valign="top">86.27</td></tr><tr><td align="left" valign="top">CM<sup><xref ref-type="table-fn" rid="table9fn13">m</xref></sup>-NER<sup><xref ref-type="table-fn" rid="table9fn14">n</xref></sup> [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">86.45</td><td align="left" valign="top">86.13</td><td align="left" valign="top">86.29</td></tr><tr><td align="left" valign="top">Our proposed model</td><td align="left" valign="top">87.26</td><td align="left" valign="top">87.27</td><td align="left" 
valign="top">87.26</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table9fn2"><p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p></fn><fn id="table9fn3"><p><sup>c</sup>CRF: conditional random field.</p></fn><fn id="table9fn4"><p><sup>d</sup>ELMo: embeddings from language models.</p></fn><fn id="table9fn5"><p><sup>e</sup>LSTM: long short-term memory.</p></fn><fn id="table9fn6"><p><sup>f</sup>ACNN: all convolutional neural network.</p></fn><fn id="table9fn7"><p><sup>g</sup>ET: encoder from transformer.</p></fn><fn id="table9fn8"><p><sup>h</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table9fn9"><p><sup>i</sup>IDCNN: iterated dilated convolutional neural network.</p></fn><fn id="table9fn10"><p><sup>j</sup>MHA: multihead attention.</p></fn><fn id="table9fn11"><p><sup>k</sup>GAT: graph attention network.</p></fn><fn id="table9fn12"><p><sup>l</sup>CNN: convolutional neural network.</p></fn><fn id="table9fn13"><p><sup>m</sup>CM: Chinese medical.</p></fn><fn id="table9fn14"><p><sup>n</sup>NER: named entity recognition.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Unstructured EHR data contains valuable patient information that can be used for clinical decision support and research [<xref ref-type="bibr" rid="ref32">32</xref>]. In the Chinese breast cancer EHR corpus constructed in this study, there are 876 admission records, comprising a total of 528,925 tokens. Annotations were performed for 4 major categories and 17 subcategories of named entities, with each subcategory having more than 100 annotated instances. In the NER task, the model we propose outperforms other models currently applied in medical-related fields. The results of ablation experiments revealed that the fusion of the pretrained model with the DGCNN-BILSTM-MHA-CRF model effectively enhances entity recognition performance. Specifically, the introduction of the DGCNN structure on top of the baseline results in the maximum improvement of 0.35%. This enhancement is attributed to the structure&#x2019;s ability to mitigate the risk of gradient vanishing and facilitate information transmission across multiple channels, capturing a wealth of feature information. In addition, the incorporation of the MHA mechanism on this foundation not only enhances the model&#x2019;s generalization capability but also enables parallel attention computation, strengthening feature fusion capability [<xref ref-type="bibr" rid="ref24">24</xref>]. This leads to improved use of valid information and reduced impact of irrelevant information, resulting in a further <italic>F</italic><sub>1</sub><italic>-</italic>score improvement of 0.17%.</p><p>Comparing the results of the pretrained model experiments, it is evident that the proposed ChCancerBERT pretrained model outperforms various state-of-the-art BERT models, with the medical domain-specific MC-BERT model ranking second. This is attributed to MC-BERT incorporating more medical corpus training into BERT, and ChCancerBERT further including the Chinese cancer corpus on top of MC-BERT. This augmentation effectively enhances the model&#x2019;s semantic representation capability in the Chinese cancer domain, yielding superior results in the Chinese breast cancer corpus in this study. 
In addition, the study&#x2019;s results indicate that pretrained BERT models based on specific domains can lead to improved performance in downstream tasks [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Among the 4 categories extracted in this study, the performance in extracting time entities stood out, with an <italic>F</italic><sub>1</sub><italic>-</italic>score of 92.79%. This result can be attributed to the relatively high quantity of time entities in this dataset, amounting to 4644, and to the fact that time entities exhibit a more standardized writing format than other entities, which reduces the difficulty of recognition and enhances recognition accuracy [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>We scrutinized the entity recognition outcomes of the best-performing fusion model from our experiments. Categories with infrequent occurrences in the dataset, such as TNM, Bi-RADS, and some other entities in the &#x201C;tests&#x201D; class, are prone to omission errors. These errors manifest as instances where entities that should have been identified remain undetected, suggesting a propensity of the model used in this study to overlook less common entity categories [<xref ref-type="bibr" rid="ref34">34</xref>]. Such omissions may result in incomplete information extraction, which is crucial for disease diagnosis and the formulation of treatment plans. Therefore, downstream applications should incorporate clinician verification mechanisms or error correction modules to ensure safety in clinical use.</p><p>In this study, entities of considerable length, such as chemotherapy, radiotherapy, and other entities in the &#x201C;treatments&#x201D; class, are typically classified correctly by the model. However, the model is susceptible to boundary recognition errors [<xref ref-type="bibr" rid="ref7">7</xref>], leading either to the exclusion of characters at an entity&#x2019;s boundary or to the inclusion of extraneous characters beyond it, which may cause confusion in clinical documentation. Furthermore, Chinese words frequently exhibit polysemy and ambiguity [<xref ref-type="bibr" rid="ref33">33</xref>], and the dataset contains instances where statements belonging to one entity category are nested within entities of another. Such intricacies heighten the risk of erroneous entity category determinations.</p><p>Concurrently, we applied the model introduced in this research to the publicly accessible CCKS2019 dataset, and the findings illustrate its superior performance compared with prior research efforts, suggesting that the ChCancerBERT framework has the potential to be extended to other cancer types and even broader clinical domains. Nevertheless, differences in documentation style and terminology across hospitals may affect generalization. Domain adaptation strategies, such as continual pretraining on local EHR corpora and transfer learning, can be explored to improve portability to other institutions.</p><p>Using our proposed model, vital clinical data encompassing timestamps, examination details, symptoms, and treatment modalities can be readily extracted from the EHRs of patients with cancer. The extracted entities provide important value for both clinical and research applications. Time, symptom, test, and treatment information can be structured into patient timelines to support disease monitoring, clinical decision support, population health analysis, and trial recruitment. 
These entities can also be mapped to standards such as the International Classification of Diseases, 10th Revision (ICD-10), enabling interoperability and secondary use of structured cancer data.</p></sec><sec id="s4-2"><title>Conclusions</title><p>This study designed and developed a pretrained model specifically tailored to the Chinese cancer domain, named ChCancerBERT. The model was then integrated with several NER architectures from the computer science literature, resulting in the ChCancerBERT-DGCNN-BiLSTM-MHA-CRF model. This integrated model was used to extract 4 major types of cancer-related entities from admission records of hospital oncology departments. We applied the proposed model to Chinese breast cancer EHR data and the CCKS2019 dataset for experimentation and validation. The proposed model achieved an <italic>F</italic><sub>1</sub><italic>-</italic>score of 86.98%, surpassing the other models compared in the experiment.</p><p>The primary limitation of this study is the restricted data accessibility and relatively small sample size, which may affect the statistical power and robustness of the findings. Moreover, potential variations in clinical documentation practices across different regions and institutions in China could pose challenges to the generalizability of the model. Despite these limitations, the model demonstrated promising transferability by achieving competitive performance on the CCKS2019 medical dataset, which suggests its potential applicability beyond breast cancer data. Therefore, future research should aim to acquire EHR data for other types of cancer to further validate the generalizability of the proposed model. Furthermore, we envisage building an NER system tailored for clinical application on top of this model to assist clinical researchers in data organization and extraction. In addition, we plan to establish a clinical decision support system to facilitate more effective treatment for patients.</p></sec></sec></body><back><ack><p>This work was supported by the Beijing University of Chinese Medicine Independent Topic Selection Young Faculty Project (grant: 040201004002011), the 2024 Independent Research Projects for Postgraduate Students of Beijing University of Chinese Medicine (grant: ZJKT2024033), the 2022 Basic Research Business Fund &#x2018;Top-Down Leadership&#x2019; Project (grant: 2022JYBJBRW012), the Fundamental Research Funds for the Central Universities (grant: 2025JYBXJSJJ022), the Beijing University of Chinese Medicine Educational and Teaching Reform Special Project (grant: XJYZ25010), the Beijing Social Science Fund (grant: 24XCC022), the Research Funding on TCM Monitoring and Statistics for 2025 of the Monitoring and Statistics Center of the National Administration of Traditional Chinese Medicine (grant: 2025JCTJE50), the &#x2018;Top-Down Leadership&#x2019; Educational Reform Project of Beijing University of Chinese Medicine (grant: JXJBGS2501005), and the 2025 Basic Research Business Fund &#x2018;Top-Down Leadership&#x2019; Project (grant: 2025JYBJBGS009).</p></ack><notes><sec><title>Data Availability</title><p>The annotated datasets generated and analyzed during this study are not publicly available due to hospital policy and regulation, but the ChCancerBERT model is available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JC and SD participated in data collection and collation. 
JC, BZ, XT, and ZZ participated in the method design, analyzed data, and drafted the initial manuscript. JC, BZ, XT, ZZ, and JW participated in text checking and correction and helped to draft the manuscript. FG and RW oversaw and provided input on all aspects of manuscript writing and the final analytical plan. All the authors read and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BiLSTM</term><def><p>bidirectional long short-term memory</p></def></def-item><def-item><term id="abb2">DGCNN</term><def><p>dilated-gated convolutional neural network</p></def></def-item><def-item><term id="abb3">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb4">ELMo</term><def><p>embeddings from language models</p></def></def-item><def-item><term id="abb5">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb6">MHA</term><def><p>multihead attention</p></def></def-item><def-item><term id="abb7">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb8">NLP</term><def><p>natural language processing</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deimazar</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sheikhtaheri</surname><given-names>A</given-names> </name></person-group><article-title>Machine learning models to detect and predict patient safety events using electronic health records: a systematic review</article-title><source>Int J Med Inform</source><year>2023</year><month>12</month><volume>180</volume><fpage>105246</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105246</pub-id><pub-id pub-id-type="medline">37837710</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Hung</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>WT</given-names> </name><etal/></person-group><article-title>Chinese herbal medicine improved survival in stage IV breast cancer patients: data mining of the Incorporated Taiwan Cancer Registry Database and hospital database</article-title><source>Integr Cancer Ther</source><year>2023</year><volume>22</volume><fpage>15347354231178898</fpage><pub-id pub-id-type="doi">10.1177/15347354231178898</pub-id><pub-id pub-id-type="medline">37278256</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Clinical characteristics, treatment patterns, and effectiveness in Chinese patients with angina pectoris using electronic patient-reported outcomes: protocol for a multicenter, prospective, cohort study (GREAT)</article-title><source>Adv 
Ther</source><year>2023</year><month>04</month><volume>40</volume><issue>4</issue><fpage>1899</fpage><lpage>1912</lpage><pub-id pub-id-type="doi">10.1007/s12325-023-02425-0</pub-id><pub-id pub-id-type="medline">36737594</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdullah</surname><given-names>MHA</given-names> </name><name name-style="western"><surname>Aziz</surname><given-names>N</given-names> </name><name name-style="western"><surname>Abdulkadir</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Alhussian</surname><given-names>HSA</given-names> </name><name name-style="western"><surname>Talpur</surname><given-names>N</given-names> </name></person-group><article-title>Systematic literature review of information extraction from textual data: recent methods, applications, trends, and challenges</article-title><source>IEEE Access</source><year>2023</year><volume>11</volume><fpage>10535</fpage><lpage>10562</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2023.3240898</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>XH</given-names> </name></person-group><article-title>Chinese clinical named entity recognition with variant neural structures based on BERT methods</article-title><source>J Biomed Inform</source><year>2020</year><month>07</month><volume>107</volume><fpage>103422</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103422</pub-id><pub-id pub-id-type="medline">32353595</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name></person-group><article-title>Named entity recognition of Chinese electronic medical records based on a hybrid neural network and medical MC-BERT</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>12</month><day>1</day><volume>22</volume><issue>1</issue><fpage>315</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-02059-2</pub-id><pub-id pub-id-type="medline">36457119</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name></person-group><article-title>CancerBERT: a cancer domain-specific language model for extracting breast cancer phenotypes from electronic health records</article-title><source>J Am Med Inform 
Assoc</source><year>2022</year><month>06</month><day>14</day><volume>29</volume><issue>7</issue><fpage>1208</fpage><lpage>1216</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac040</pub-id><pub-id pub-id-type="medline">35333345</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>A cross-institutional evaluation on breast cancer phenotyping NLP algorithms on electronic health records</article-title><source>Comput Struct Biotechnol J</source><year>2023</year><volume>22</volume><fpage>32</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1016/j.csbj.2023.08.018</pub-id><pub-id pub-id-type="medline">37680211</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Najafabadipour</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tunas</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Rodriguez-Gonzalez</surname><given-names>A</given-names> </name></person-group><article-title>Lung cancer concept annotation from Spanish clinical narratives</article-title><conf-name>13th International Conference on Data Integration in the Life Sciences 2018 (DILS2018)</conf-name><conf-date>Nov 20-21, 2018</conf-date><pub-id pub-id-type="doi">10.1007/978-3-030-06016-9_15</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yim</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Kwan</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name></person-group><article-title>Tumor reference resolution and characteristic extraction in radiology reports for liver cancer stage prediction</article-title><source>J Biomed Inform</source><year>2016</year><month>12</month><volume>64</volume><fpage>179</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2016.10.005</pub-id><pub-id pub-id-type="medline">27729234</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Tseytlin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Finan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>DeepPhe: A natural language processing system for extracting cancer phenotypes from clinical records</article-title><source>Cancer Res</source><year>2017</year><month>11</month><day>1</day><volume>77</volume><issue>21</issue><fpage>e115</fpage><lpage>e118</lpage><pub-id pub-id-type="doi">10.1158/0008-5472.CAN-17-0615</pub-id><pub-id pub-id-type="medline">29092954</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weegar</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Kvist</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sundstr&#x00F6;m</surname><given-names>K</given-names> </name><name name-style="western"><surname>Brunak</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name></person-group><article-title>Finding cervical cancer symptoms in Swedish clinical text using a machine learning approach and NegEx</article-title><source>AMIA Annu Symp Proc</source><year>2015</year><access-date>2025-11-03</access-date><volume>2015</volume><fpage>1296</fpage><lpage>1305</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://pmc.ncbi.nlm.nih.gov/articles/PMC4765575/">https://pmc.ncbi.nlm.nih.gov/articles/PMC4765575/</ext-link></comment><pub-id pub-id-type="medline">26958270</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>An</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>FX</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name></person-group><article-title>Chinese clinical named entity recognition via multi-head self-attention based BiLSTM-CRF</article-title><source>Artif Intell Med</source><year>2022</year><month>05</month><volume>127</volume><fpage>102282</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2022.102282</pub-id><pub-id pub-id-type="medline">35430042</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name></person-group><article-title>Incorporating multi-level CNN and attention mechanism for Chinese clinical named entity recognition</article-title><source>J Biomed Inform</source><year>2021</year><month>04</month><volume>116</volume><fpage>103737</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2021.103737</pub-id><pub-id pub-id-type="medline">33737207</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Long</surname><given-names>J</given-names> </name><name name-style="western"><surname>Qu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name></person-group><article-title>Chinese clinical named entity recognition with ALBERT and MHA mechanism</article-title><source>Evid Based Complement Alternat Med</source><year>2022</year><volume>2022</volume><issue>2056039</issue><fpage>2056039</fpage><pub-id pub-id-type="doi">10.1155/2022/2056039</pub-id><pub-id pub-id-type="medline">35656458</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Ying</surname><given-names>J</given-names> </name></person-group><article-title>A weakly supervised method for named entity recognition of Chinese electronic medical records</article-title><source>Med Biol Eng Comput</source><year>2023</year><month>10</month><volume>61</volume><issue>10</issue><fpage>2733</fpage><lpage>2743</lpage><pub-id pub-id-type="doi">10.1007/s11517-023-02871-6</pub-id><pub-id pub-id-type="medline">37453978</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Measures for ethical review of life sciences and medical research involving human subjects [Website in Chinese]</article-title><source>National Health Commission</source><year>2023</year><access-date>2025-10-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.gov.cn/zhengce/zhengceku/2023-02/28/content_5743658.htm">https://www.gov.cn/zhengce/zhengceku/2023-02/28/content_5743658.htm</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A multi-attention approach using BERT and stacked bidirectional LSTM for improved dialogue state tracking</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>3</issue><fpage>1775</fpage><pub-id pub-id-type="doi">10.3390/app13031775</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name></person-group><article-title>Deep learning&#x2013;based methods for natural hazard named entity recognition</article-title><source>Sci Rep</source><year>2022</year><month>03</month><day>17</day><volume>12</volume><issue>1</issue><fpage>4598</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-08667-2</pub-id><pub-id pub-id-type="medline">35301387</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Z</given-names> </name></person-group><article-title>Dependency syntax guided BERT-BiLSTM-GAM-CRF for Chinese NER</article-title><source>Expert 
Syst Appl</source><year>2022</year><month>06</month><volume>196</volume><fpage>116682</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2022.116682</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Runmei</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lulu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Chinese named entity recognition method combining ALBERT and a local adversarial training and adding attention mechanism</article-title><source>Int J Semant Web Inf Syst</source><year>2022</year><month>01</month><volume>18</volume><issue>1</issue><fpage>1</fpage><lpage>20</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://services.igi-global.com/resolvedoi/resolve.aspx?doi=10.4018/IJSWIS.20220101">https://services.igi-global.com/resolvedoi/resolve.aspx?doi=10.4018/IJSWIS.20220101</ext-link></comment><pub-id pub-id-type="doi">10.4018/IJSWIS.313946</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name></person-group><article-title>Named entity recognition of ancient poems based on Albert&#x2010;BiLSTM&#x2010;MHA&#x2010;CRF model</article-title><source>Wirel Commun Mob Comput</source><year>2022</year><month>01</month><volume>2022</volume><issue>1</issue><fpage>1</fpage><lpage>11</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://onlinelibrary.wiley.com/toc/6302/2022/1">https://onlinelibrary.wiley.com/toc/6302/2022/1</ext-link></comment><pub-id pub-id-type="doi">10.1155/2022/6507719</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name></person-group><article-title>Document-level attention-based BiLSTM-CRF incorporating disease dictionary for disease named entity recognition</article-title><source>Comput Biol Med</source><year>2019</year><month>05</month><volume>108</volume><fpage>122</fpage><lpage>132</lpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2019.04.002</pub-id><pub-id pub-id-type="medline">31003175</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>ZC</given-names> </name><name name-style="western"><surname>Li</surname><given-names>JQ</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Akhtar</surname><given-names>F</given-names> </name></person-group><article-title>A dictionary-guided attention network for biomedical named entity recognition in Chinese electronic medical records</article-title><source>Expert Syst Appl</source><year>2023</year><month>11</month><volume>231</volume><fpage>120709</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2023.120709</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kramer</surname><given-names>F</given-names> </name></person-group><article-title>Annotated dataset creation through large language models for non-English medical NLP</article-title><source>J Biomed Inform</source><year>2023</year><month>09</month><volume>145</volume><fpage>104478</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104478</pub-id><pub-id pub-id-type="medline">37625508</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lingren</surname><given-names>T</given-names> </name><name name-style="western"><surname>Deleger</surname><given-names>L</given-names> </name><name name-style="western"><surname>Molnar</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Evaluating the impact of pre-annotation on annotation speed and potential bias: natural language processing gold standard development for clinical named entity recognition in clinical trial announcements</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>3</issue><fpage>406</fpage><lpage>413</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001837</pub-id><pub-id pub-id-type="medline">24001514</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name><name name-style="western"><surname>Solti</surname><given-names>I</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cadag</surname><given-names>E</given-names> </name></person-group><article-title>Community annotation experiment for ground truth generation for the i2b2 medication challenge</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>519</fpage><lpage>523</lpage><pub-id pub-id-type="doi">10.1136/jamia.2010.004200</pub-id><pub-id pub-id-type="medline">20819855</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>China conference on knowledge graph and semantic computing</article-title><access-date>2025-11-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.sigkg.cn/ccks2019/">https://www.sigkg.cn/ccks2019/</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>L</given-names> 
</name><etal/></person-group><article-title>Chinese clinical named entity recognition in electronic medical records: development of a lattice long short-term memory model with contextualized character representations</article-title><source>JMIR Med Inform</source><year>2020</year><month>09</month><day>4</day><volume>8</volume><issue>9</issue><fpage>e19848</fpage><pub-id pub-id-type="doi">10.2196/19848</pub-id><pub-id pub-id-type="medline">32885786</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wan</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>B</given-names> </name></person-group><article-title>A self-attention based neural architecture for Chinese medical named entity recognition</article-title><source>Math Biosci Eng</source><year>2020</year><month>05</month><day>9</day><volume>17</volume><issue>4</issue><fpage>3498</fpage><lpage>3511</lpage><pub-id pub-id-type="doi">10.3934/mbe.2020197</pub-id><pub-id pub-id-type="medline">32987540</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name></person-group><article-title>Multi-level semantic fusion network for Chinese medical named entity recognition</article-title><source>J Biomed Inform</source><year>2022</year><month>09</month><volume>133</volume><fpage>104144</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104144</pub-id><pub-id pub-id-type="medline">35878823</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landolsi</surname><given-names>MY</given-names> </name><name name-style="western"><surname>Hlaoua</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ben Romdhane</surname><given-names>L</given-names> </name></person-group><article-title>Information extraction from electronic medical documents: state of the art and future research directions</article-title><source>Knowl Inf Syst</source><year>2023</year><volume>65</volume><issue>2</issue><fpage>463</fpage><lpage>516</lpage><pub-id pub-id-type="doi">10.1007/s10115-022-01779-1</pub-id><pub-id pub-id-type="medline">36405956</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Chinese engineering geological named 
entity recognition by fusing multi-features and data enhancement using deep learning</article-title><source>Expert Syst Appl</source><year>2024</year><month>03</month><volume>238</volume><fpage>121925</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2023.121925</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Chinese mineral named entity recognition based on BERT model</article-title><source>Expert Syst Appl</source><year>2022</year><month>11</month><volume>206</volume><fpage>117727</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2022.117727</pub-id></nlm-citation></ref></ref-list></back></article>