<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e60334</article-id><article-id pub-id-type="doi">10.2196/60334</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Chinese Clinical Named Entity Recognition With Segmentation Synonym Sentence Synthesis Mechanism: Algorithm Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Tang</surname><given-names>Jian</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Huang</surname><given-names>Zikun</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Hongzhen</given-names></name><degrees>MMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Hao</given-names></name><degrees>MMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>Hailing</given-names></name><degrees>BMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tang</surname><given-names>Minqiong</given-names></name><degrees>MMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Luo</surname><given-names>Pengsheng</given-names></name><degrees>BMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Qin</surname><given-names>Dong</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Pharmacy, People's Hospital of Guilin</institution>, <addr-line>12 Wenming Road</addr-line><addr-line>Guilin</addr-line>, <country>China</country></aff><aff id="aff2"><institution>School of Science and Technology, Guilin University</institution>, <addr-line>Guilin</addr-line>, <country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zaghir</surname><given-names>Jamil</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Dong Qin, BSc, Department of Pharmacy, People's Hospital of Guilin, 12 Wenming Road, Guilin, 541000, China, 86 18978320258; 
<email>qindong025@163.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>21</day><month>11</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e60334</elocation-id><history><date date-type="received"><day>08</day><month>05</month><year>2024</year></date><date date-type="rev-recd"><day>22</day><month>09</month><year>2024</year></date><date date-type="accepted"><day>13</day><month>10</month><year>2024</year></date></history><copyright-statement>&#x00A9; Jian Tang, Zikun Huang, Hongzhen Xu, Hao Zhang, Hailing Huang, Minqiong Tang, Pengsheng Luo, Dong Qin. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 21.11.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e60334"/><abstract><sec><title>Background</title><p>Clinical named entity recognition (CNER) is a fundamental task in natural language processing used to extract named entities from electronic medical record texts. In recent years, with the continuous development of machine learning, deep learning models have replaced traditional machine learning and template-based methods, becoming widely applied in the CNER field. However, due to the complexity of clinical texts, the diversity and large quantity of named entity types, and the unclear boundaries between different entities, existing advanced methods rely to some extent on annotated databases and the scale of embedded dictionaries.</p></sec><sec><title>Objective</title><p>This study aims to address the issues of data scarcity and labeling difficulties in CNER tasks by proposing a dataset augmentation algorithm based on proximity word calculation.</p></sec><sec sec-type="methods"><title>Methods</title><p>We propose a Segmentation Synonym Sentence Synthesis (SSSS) algorithm based on neighboring vocabulary, which leverages existing public knowledge without the need for manual expansion of specialized domain dictionaries. Through lexical segmentation, the algorithm replaces new synonymous vocabulary by recombining from vast natural language data, achieving nearby expansion expressions of the dataset. 
We applied the SSSS algorithm to the Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach (RoBERTa) + conditional random field (CRF) and RoBERTa + Bidirectional Long Short-Term Memory (BiLSTM) + CRF models and evaluated our models (SSSS + RoBERTa + CRF; SSSS + RoBERTa + BiLSTM + CRF) on the China Conference on Knowledge Graph and Semantic Computing (CCKS) 2017 and 2019 datasets.</p></sec><sec sec-type="results"><title>Results</title><p>Our experiments demonstrated that the models SSSS + RoBERTa + CRF and SSSS + RoBERTa + BiLSTM + CRF achieved <italic>F</italic><sub>1</sub>-scores of 91.30% and 91.35% on the CCKS-2017 dataset, respectively. They also achieved <italic>F</italic><sub>1</sub>-scores of 83.21% and 83.01% on the CCKS-2019 dataset, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The experimental results indicated that our proposed method successfully expanded the dataset and remarkably improved the performance of the model, effectively addressing the challenges of data acquisition, annotation difficulties, and insufficient model generalization performance.</p></sec></abstract><kwd-group><kwd>clinical named entity recognition</kwd><kwd>word embedding</kwd><kwd>Chinese electronic medical records</kwd><kwd>RoBERTa</kwd><kwd>entity recognition</kwd><kwd>segmentation</kwd><kwd>natural language processing</kwd><kwd>AI</kwd><kwd>artificial intelligence</kwd><kwd>dataset</kwd><kwd>dataset augmentation</kwd><kwd>algorithm</kwd><kwd>entity</kwd><kwd>EMR</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Named entity recognition (NER) is an important subtask in natural language processing [<xref ref-type="bibr" rid="ref1">1</xref>]. Its primary function is to identify and classify entities such as diseases in textual data. 
In the clinical domain, clinical NER (CNER) is used to recognize and classify clinical textual data such as diseases, symptoms, treatments, tests, body parts, and medications in electronic medical records (EMRs) [<xref ref-type="bibr" rid="ref2">2</xref>]. CNER is mission critical for building intelligent medical assistive systems, such as clinical decision support systems, and constructing medical knowledge graphs [<xref ref-type="bibr" rid="ref3">3</xref>]. However, clinical text data are usually unstructured, and clinical text syntax might be incomplete with poor contextualization. Clinical terms may have different meanings in different contexts, and this variability and ambiguity make the identification and classification of named entities extremely challenging, thus making NER in the clinical domain more challenging compared to NER in the general domain [<xref ref-type="bibr" rid="ref4">4</xref>]. Additionally, Chinese EMRs will appear to be more complicated compared to those written in Roman alphabet languages due to the complexity of Chinese grammatical structure and clausal rules [<xref ref-type="bibr" rid="ref5">5</xref>]. With a relatively flexible word order, the subject-verb-object sequence of the Chinese language depends on the emphasis of the content. In contrast, the sentence structure in Roman alphabet languages is relatively fixed, where the word order has minimal impact on semantics. In Chinese, subjects, objects, or other components are frequently omitted, which poses additional challenges for tasks like NER, as this requires interpreting and adding this missing information. In Roman alphabet languages, sentence components are typically expressed explicitly and omissions are less common. Even when omissions do occur, verb conjugations generally provide sufficient contextual clues. 
In Chinese EMRs, technical terminology and colloquial descriptions are often interwoven, and the frequent use of polysemy and vague expressions further contributes to linguistic diversity and complexity.</p><p>Over the past decade, remarkable advancements have been made in the field of CNER [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Although conventional dictionary-based techniques can identify names and distinct clinical concepts with high accuracy and precision in matching, the quality and size of dictionaries directly impact recognition outcomes. With the development of machine learning, the theoretical basis for several unsupervised learning algorithms revolves around the distributional hypothesis proposed by Zellig Harris [<xref ref-type="bibr" rid="ref9">9</xref>]. This hypothesis posits that words with similar semantic meanings tend to appear in coherent contexts. Consequently, these algorithms assign vector representations to words based on their contextual associations. Two notable examples of such algorithms that use the distributional hypothesis are GloVe and word2vec. Word2vec relies on prediction models, while GloVe is based on count-based calculations.</p><p>CNER presents increased complexity and challenges. This is due to the widespread use of unconventional abbreviations and various representations of the same entities within the Chinese language. These factors greatly impede the accurate and efficient extraction of crucial information. To address this challenge, dictionary-based approaches require a deep understanding and thorough utilization of well-annotated data sources and relevant knowledge bases. This approach enhances model performance and generalizability.</p><p>The adoption of deep learning has led to the emergence of numerous models using a variety of approaches. 
One such example is the work conducted by Li et al [<xref ref-type="bibr" rid="ref10">10</xref>], who utilized a lattice long short-term memory (LSTM) model incorporating contextualized character representation for recognizing clinical named entities in Chinese. They developed a novel variant of contextualized character representation and incorporated a conditional random field (CRF) layer into their model. Xu et al [<xref ref-type="bibr" rid="ref11">11</xref>] introduced a novel neural network approach referred to as dictionary-attention-Bidirectional LSTM-CRF (Dic-Att-BiLSTM-CRF) for disease NER. Their method involved applying an efficient and precise string-matching technique to identify disease entities with disease dictionaries constructed from the disease ontology. Furthermore, Dic-Att-BiLSTM-CRF created a dictionary attention layer by integrating disease dictionary matching strategies and document-level attention mechanisms. Wang et al [<xref ref-type="bibr" rid="ref12">12</xref>] constructed a dictionary- and context-based approach using medical literature to construct feature vectors for each Chinese character in their proposed combination method of knowledge-driven dictionary methods and data-driven deep learning for NER tasks. The results showed that this approach effectively improved the processing of rare entities; as the size of the dictionary increased, the performance of the method gradually improved.</p><p>Despite significant advancements in these methods, several limitations remain. The performance of these approaches relies to some extent on the annotation and embedding capabilities of the underlying databases [<xref ref-type="bibr" rid="ref13">13</xref>]. Medical datasets often encounter challenges in data collection and annotation, and concerns regarding patient privacy protection and compliance contribute to smaller document collections. 
Moreover, rarer diseases, drugs, and entities occur less frequently, making it difficult to train models effectively. Few existing methods are universally applicable across diverse datasets, and the generalization performance of the models requires further enhancement due to the peculiarity of medical texts. EMRs abound with ambiguous terms, nonstandard abbreviations, and variations of the same entity, for example, &#x201C;&#x5965;&#x6C99;&#x5229;&#x94C2;(oxaliplatin)&#x201D; and &#x201C;&#x5965;&#x6C99;&#x5229;&#x67CF;(oxaliplatin)&#x201D; [<xref ref-type="bibr" rid="ref14">14</xref>] and &#x201C;&#x5FC3;&#x808C;&#x6897;&#x6B7B;(Myocardial Infarction)&#x201D; and &#x201C;&#x5FC3;&#x808C;&#x6897;&#x585E;(Myocardial Infarction).&#x201D; Doctors&#x2019; writing styles differ significantly, leading to intricate text structures and challenging comprehension. Current NER tasks in the medical domain are primarily focused on Chinese NER, which presents a challenge due to unclear entity boundaries and difficulties in Chinese word segmentation, thereby undermining model performance.</p><p>Based on the above problems, this paper proposes a Segmentation Synonym Sentence Synthesis (SSSS) algorithm based on proximity lexical expressions, which was extensively validated on the China Conference on Knowledge Graph and Semantic Computing (CCKS) 2017 and 2019 datasets. The main contributions of this paper are as follows:</p><list list-type="order"><list-item><p>We propose an adaptive SSSS algorithm for dataset optimization, which exploits existing public knowledge without manually expanding specialized domain dictionaries. 
It achieved proximity expansion expression of the dataset through lexical cuts, recombined by substituting new proximity repertoires from vast natural language data.</p></list-item><list-item><p>By expanding the proximity vocabulary, our algorithm successfully extended the documents of CCKS-2017 and CCKS-2019 by approximately 17 and 20 times, respectively.</p></list-item><list-item><p>We evaluated the algorithm&#x2019;s performance on CCKS-2017 and CCKS-2019 and achieved relatively competitive results compared to other state-of-the-art models. By extending the proximity vocabulary, our models (SSSS + Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach [RoBERTa] + CRF and SSSS + RoBERTa + Bidirectional Long Short-Term Memory Network [BiLSTM] + CRF) outperformed both Bidirectional Encoder Representations from Transformers [BERT] + CRF and BERT + BiLSTM + CRF models in handling unknown and low-frequency entities.</p></list-item></list></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Generating an Extended Dataset Based on Proximal Vocabulary</title><p>In our experiment, it was observed that specific entities related to &#x201C;disease&#x201D; and &#x201C;therapy&#x201D; were relatively scarce compared to other categories in the training dataset. This imbalance in entity distribution may weaken the model&#x2019;s effectiveness when dealing with rare or subtle mentions of these topics in the medical field. Additionally, given the complexity and uniqueness of the medical domain, creating comprehensive dictionaries requires substantial engineering efforts and expertise from professionals to ensure smooth execution.</p><p>In this work, we drew inspiration from the concept of proximal lexical expressions [<xref ref-type="bibr" rid="ref15">15</xref>] and proposed a method called SSSS. The implementation of this algorithm involved several steps. 
First, text segmentation was performed using the Jieba library. Then, based on the natural language word library trained with Word2Vec, synonyms were searched and processed using the Synonyms database. Finally, these identified synonyms were integrated into the original training set at appropriate positions.</p><p>Specifically, when entity X appeared in the training data, we first used the Jieba library to divide it into multiple simple words, such as X1, X2, and X3. If the number of simple words for an entity exceeded 2, we used the edit distance algorithm to search for synonyms related to it in the Synonyms database [<xref ref-type="bibr" rid="ref16">16</xref>]. For example, &#x201C;Norfloxacin&#x201D; can be associated with its synonym &#x201C;Fluoroquinolones,&#x201D; which are different names for the same drug. Additionally, we replaced the original simple words in the processed sentences with the identified synonyms and then reassembled these new complex words to generate synthetic sentences. For instance, after breaking down &#x201C;Pelvic MRI&#x201D; into &#x201C;Pelvis&#x201D; and &#x201C;MRI,&#x201D; we reconstructed them into a sentence using their corresponding synonyms: &#x201C;Pelvic nuclear magnetic resonance examination.&#x201D; Through these steps, our aim was to enhance the diversity and richness of the training data, which may contribute to improving the final model&#x2019;s generalization ability and accuracy. The replaced vocabulary was reintegrated into the surrounding context sentences, aiming to supplement more sentence expressions and vocabulary information without altering the original meaning of the sentences. 
In similarity calculations, only segmented words were considered; after dimensionality reduction using principal component analysis, they were visualized in a 2D space as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>To improve the generalizability and adaptability of models faced with restricted training datasets, this algorithm explored various synonymous or interchangeable wordings while retaining the primary connotations of words. This strategy enabled the expansion of the training dataset size without the need for additional domain-specific dictionaries, thereby reducing reliance on input from domain experts. Consequently, both the workload of domain expertise personnel and the labeling workforce required for datasets were significantly reduced. By implementing this approach, we utilized the SSSS algorithm to enhance the information and vocabulary within the training set, thereby improving the model&#x2019;s learning ability. <xref ref-type="table" rid="table1">Table 1</xref> presents some examples.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Two-dimensional spatial representation of sample vocabulary.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Examples of Segmentation Synonym Sentence Synthesis algorithm expansion.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity types</td><td align="left" valign="bottom">Sentence</td><td align="left" valign="bottom">Entity</td><td align="left" valign="bottom">Postexpansion entity</td></tr></thead><tbody><tr><td align="left" valign="top">Body</td><td align="left" valign="top">&#x53F3;&#x624B;&#x4E2D;&#x6307;&#x75BC;&#x75DB;&#x4E0D;&#x9002; (Pain and discomfort in the right middle finger)</td><td align="left" 
valign="top">&#x53F3;&#x624B;&#x4E2D;&#x6307; (Right middle finger)</td><td align="left" valign="top">&#x53F3;&#x4E2D;&#x6307; (Right middle finger)</td></tr><tr><td align="left" valign="top">Symptom</td><td align="left" valign="top">&#x4E3B;&#x56E0;&#x5934;&#x90E8;&#x5916;&#x4F24;&#x51FA;&#x8840;&#x4F34;&#x5934;&#x660F;3.5&#x5C0F;&#x65F6;&#x5165;&#x9662; (The patient was admitted due to head trauma with bleeding and dizziness for 3.5 hours)</td><td align="left" valign="top">&#x5934;&#x660F; (Dizziness)</td><td align="left" valign="top">&#x5934;&#x6655; (Dizziness)</td></tr><tr><td align="left" valign="top">Exam</td><td align="left" valign="top">&#x5FC3;&#x7535;&#x56FE;, &#x9888;&#x52A8;&#x8109;&#x5F69;&#x8D85;&#x7B49;&#x68C0;&#x67E5; (Electrocardiogram, carotid artery Doppler ultrasound, and other tests)</td><td align="left" valign="top">&#x5FC3;&#x7535;&#x56FE; (Electrocardiogram), &#x9888;&#x52A8;&#x8109;&#x5F69;&#x8D85; (Carotid artery Doppler ultrasound)</td><td align="left" valign="top">&#x5FC3;&#x7535;&#x56FE; (Electrocardiogram), &#x53CC;&#x4FA7;&#x9888;&#x52A8;&#x8109;&#x5F69;&#x8D85; (Bilateral carotid artery Doppler ultrasound)</td></tr><tr><td align="left" valign="top">Treatment</td><td align="left" valign="top">&#x7ED9;&#x4E88;&#x9759;&#x70B9;&#x5934;&#x5B62;&#x54CC;&#x916E;, &#x708E;&#x7425;&#x5B81;&#x8054;&#x5408;&#x6297;&#x611F;&#x67D3; (Administered intravenous cefoperazone and ibuprofen for combined anti-infection treatment)</td><td align="left" valign="top">&#x5934;&#x5B62;&#x54CC;&#x916E; (Cefoperazone), &#x708E;&#x7425;&#x5B81; (Ibuprofen)</td><td align="left" valign="top">&#x5934;&#x5B62;&#x54CC;&#x916E;&#x8212;&#x5DF4;&#x5766;&#x94A0; (Cefoperazone and sulbactam sodium), &#x708E;&#x7425;&#x5B81; (Ibuprofen)</td></tr></tbody></table></table-wrap></sec><sec id="s2-2"><title>Models</title><sec id="s2-2-1"><title>BERT and RoBERTa</title><p>BERT [<xref ref-type="bibr" rid="ref17">17</xref>] is an outstanding pretrained model for text vector 
representation. Comprising multiple layers of bidirectional transformer encoders, it has the capability to consider the words both before and after a given word, enabling it to ascertain the word&#x2019;s meaning within the context. The structure of the BERT model is illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>. This model is obtained through unsupervised task training on a vast corpus of everyday language. It leverages the self-attention mechanism embedded in its encoder layers to learn enhanced word feature representations, which can be directly applied to downstream tasks. However, due to the less frequent occurrence of medical terms in everyday language corpora and the inclusion of more long-tail vocabulary, such as specialized terminologies, it is essential to conduct secondary training on supervised medical corpora for downstream tasks. RoBERTa [<xref ref-type="bibr" rid="ref18">18</xref>], developed by Facebook, is a derivative version of the original BERT model. It inherits BERT&#x2019;s basic architecture, including stacked transformer layers and bidirectional context encoding. It enhances the training set&#x2019;s variability through dynamic masking in language modeling, improving the model&#x2019;s comprehension abilities. Additionally, RoBERTa uses a larger pretraining dataset and a bigger batch size, resulting in superior performance. It is reasonable to expect that replacing BERT with RoBERTa could lead to even better outcomes.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>BERT and RoBERTa model structure diagram. 
BERT: Bidirectional Encoder Representations from Transformers; RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig02.png"/></fig></sec><sec id="s2-2-2"><title>BiLSTM Model</title><p>The BiLSTM model is a deep learning architecture designed for processing sequential data, achieved by integrating 2 independent BiLSTM networks. Specifically, the BiLSTM model comprises 2 LSTM modules: one reads the sequence from left to right, and the other reads from right to left. Numerous studies have used bidirectional recurrent neural networks to extract local features, integrating them into global information after obtaining the latter using BERT [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. A vector of length T, represented as <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, &#x2026;, <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, serves as the input to the LSTM units, generating an output sequence of vectors <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, &#x2026;, <inline-formula><mml:math 
id="ieqn6"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, all of equal length, through the application of nonlinear transformations learned during the training phase. Each <inline-formula><mml:math id="ieqn7"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is referred to as the activation of the LSTM at token t. The computational process of neurons in the LSTM is illustrated by <xref ref-type="disp-formula" rid="E1 E2 E3 E4">Equations 1-4</xref>.</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:math></disp-formula><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfenced 
separators="|"><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x2299;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2299;</mml:mo><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">h</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></disp-formula><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi> 
</mml:mi><mml:mi>&#x03C3;</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></disp-formula><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2299;</mml:mo><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">h</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></disp-formula><p>In the equations above, <inline-formula><mml:math id="ieqn8"><mml:mi>W</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn9"><mml:mi>b</mml:mi></mml:math></inline-formula> are trainable parameters, <inline-formula><mml:math id="ieqn10"><mml:mi mathvariant="normal">&#x03C3;</mml:mi></mml:math></inline-formula> represents the element-wise sigmoid function, and <inline-formula><mml:math 
id="ieqn11"><mml:mo>&#x2299;</mml:mo><mml:mi> </mml:mi></mml:math></inline-formula> is the element-wise product.</p></sec><sec id="s2-2-3"><title>CRF Model</title><p>The CRF model is a machine learning model utilized for processing sequence data, especially in natural language processing. It typically takes a sequence of text as input and generates a corresponding sequence of hidden states as output. In the sequence labeling step of our research, there exists a dependency relationship between adjacent labels. For instance, an inside tag &#x201C;I&#x201D; must follow a beginning tag &#x201C;B.&#x201D; We incorporate a CRF layer following the BERT or BiLSTM layer to compute the optimal sequence combination. This layer considers the dependency relationships between adjacent labels, ensuring that an inside tag &#x201C;I&#x201D; follows a beginning tag &#x201C;B&#x201D; while maintaining a consistent type [<xref ref-type="bibr" rid="ref21">21</xref>]. CRF assumes that a Markov random field has 2 sets of variables, where the X set usually represents a given value, denoting the input sequence, and Y represents the output under the given X condition as the corresponding output label. 
The graph of a CRF satisfies the following properties.</p><p>When we are under the global condition of <inline-formula><mml:math id="ieqn12"><mml:mi>X</mml:mi></mml:math></inline-formula>, meaning that the value of a random variable in <inline-formula><mml:math id="ieqn13"><mml:mi>X</mml:mi></mml:math></inline-formula> is fixed or given, <inline-formula><mml:math id="ieqn14"><mml:mi>Y</mml:mi></mml:math></inline-formula> follows the Markov property:</p><disp-formula> <label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2223;</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>u</mml:mi><mml:mo>&#x2260;</mml:mo><mml:mi>v</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2223;</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>&#x223C;</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn15"><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>~</mml:mo><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> indicates that <inline-formula><mml:math 
id="ieqn16"><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn17"><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are neighbors in the graph.</p></sec></sec><sec id="s2-3"><title>Integration Architecture</title><p>To evaluate the effectiveness of the SSSS algorithm compared to the original dataset, this study integrated and utilized 4 separate models (ie, BERT + CRF, BERT + BiLSTM + CRF, RoBERTa + CRF, and RoBERTa + BiLSTM + CRF). These models have similar structures but were trained using different datasets, masking representations, and training steps during the pretraining phase. The BERT + CRF and BERT + BiLSTM + CRF models have already been proven effective in numerous NER experiments [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], hence they were chosen as comparative baselines for this experiment. The impact of the downstream training set on the experimental results is significant, but the choice of pretraining dataset for the pretrained models also plays a crucial role. To validate this, the study introduced the Chinese BERT model RoBERTa, which uses more Chinese training data for model training. Finally, our model structures were divided into 2 categories, those including BiLSTM and those not including BiLSTM, as shown in <xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref>, respectively. An ablation study was also conducted on the RoBERTa + CRF and RoBERTa + BiLSTM + CRF models.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>SSSS + RoBERTa + BiLSTM + CRF model structure diagram. 
CRF: conditional random field; BiLSTM: Bidirectional Long Short-Term Memory; RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach; SSSS: Segmentation Synonym Sentence Synthesis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>SSSS + RoBERTa + CRF model structure diagram. CRF: conditional random field; RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach; SSSS: Segmentation Synonym Sentence Synthesis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig04.png"/></fig></sec><sec id="s2-4"><title>Parameter Setting</title><p>In this study, beginning, inside, outside tags are utilized to denote entities. Each clinical record may consist of several sentences and treating the record as a whole could result in excessively long samples. Therefore, we separate each record with a Chinese period. All models in this experiment were trained on a 3080 Ti GPU. 
Common parameter settings for all models were standardized to ensure fairness, utilizing the parameters shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Model parameter settings.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameters</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">Learning rate of BERT/RoBERTa<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">2&#x00D7;10<sup>&#x2212;5</sup></td></tr><tr><td align="left" valign="top">Learning rate of BiLSTM<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">2&#x00D7;10<sup>&#x2212;5</sup></td></tr><tr><td align="left" valign="top">Learning rate of CRF<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">2&#x00D7;10<sup>&#x2212;3</sup></td></tr><tr><td align="left" valign="top">Max length</td><td align="left" valign="top">256</td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">32</td></tr><tr><td align="left" valign="top">Epoch</td><td align="left" valign="top">50</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>BERT/RoBERTa: Bidirectional Encoder Representations from Transformers/Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn><fn id="table2fn2"><p><sup>b</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table2fn3"><p><sup>c</sup>CRF: conditional random field.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Ethical Considerations</title><p>The CCKS-2017 and CCKS-2019 databases used in this study are publicly available and no ethical review was required.</p></sec></sec><sec id="s3" 
sec-type="results"><title>Results</title><sec id="s3-1"><title>Datasets</title><p>This study utilized 2 datasets from the CCKS-2017 CNER and CCKS-2019 CNER tasks, each consisting of training and testing sets. The training sets were used for model training, while the testing sets were used for model evaluation. All data were derived from progress notes and examination results in inpatient EMRs released by the CCKS challenge tasks. CCKS-2017 includes annotations for 5 entity types: symptoms, tests, diagnoses, treatments, and anatomical locations. CCKS-2019 encompasses annotations for 6 entity types: anatomical locations, surgeries, disease diagnoses, imaging examinations, medications, and laboratory tests. CCKS-2017 comprises 1559 training instances, while CCKS-2019 comprises 1379 training instances. The original datasets used a JSON structure to annotate the beginning and end of entities, which were then transformed into the beginning, inside, outside annotation scheme for ease of training and testing. 
The types and quantities of entities in the training datasets are shown in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Entity distribution in the China Conference on Knowledge Graph and Semantic Computing 2017 dataset.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Type</td><td align="left" valign="bottom">Quantity</td></tr></thead><tbody><tr><td align="left" valign="top">Body</td><td align="left" valign="top">9114</td></tr><tr><td align="left" valign="top">Symptom</td><td align="left" valign="top">8236</td></tr><tr><td align="left" valign="top">Exam</td><td align="left" valign="top">11,163</td></tr><tr><td align="left" valign="top">Disease</td><td align="left" valign="top">1462</td></tr><tr><td align="left" valign="top">Treatment</td><td align="left" valign="top">3260</td></tr></tbody></table></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Entity distribution in the China Conference on Knowledge Graph and Semantic Computing 2019 dataset.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Type</td><td align="left" valign="bottom">Quantity</td></tr></thead><tbody><tr><td align="left" valign="top">Laboratory</td><td align="left" valign="top">1796</td></tr><tr><td align="left" valign="top">Image</td><td align="left" valign="top">1324</td></tr><tr><td align="left" valign="top">Operation</td><td align="left" valign="top">1194</td></tr><tr><td align="left" valign="top">Disease</td><td align="left" valign="top">5540</td></tr><tr><td align="left" valign="top">Drug</td><td align="left" valign="top">2316</td></tr><tr><td align="left" valign="top">Anatomy</td><td align="left" valign="top">11,521</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Evaluation 
Metrics</title><p>Evaluation metrics are defined by the alignment of true values and predicted results, ensuring consistency in both starting and ending positions as well as correct identification of entity types. In our experiments, we utilized precision, recall, and <italic>F</italic><sub>1</sub>-scores to evaluate the recognition performance of the models; evaluations of all metrics were conducted at the entity level. To validate the feasibility of the SSSS algorithm, we selected dual baselines (BERT + CRF and BERT + BiLSTM + CRF) and dual datasets (CCKS-2017 and CCKS-2019), applying them simultaneously to different datasets and models to achieve cross-validation.</p><p>After applying the SSSS algorithm [<xref ref-type="bibr" rid="ref24">24</xref>], the CCKS-2017 dataset expanded from its original size of 1559 documents to 26,768 entries, representing an expansion of approximately 17 times. Similarly, the CCKS-2019 dataset increased from its original 1379 entries to 28,933 entries, marking an expansion of approximately 20 times. 
The extent of entity expansion is illustrated in <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref> below.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Segmentation Synonym Sentence Synthesis algorithm extended effect on the China Conference on Knowledge Graph and Semantic Computing 2017 test set.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Preexpansion</td><td align="left" valign="bottom">Postexpansion</td></tr></thead><tbody><tr><td align="left" valign="top">Body</td><td align="left" valign="top">9114</td><td align="left" valign="top">318,220</td></tr><tr><td align="left" valign="top">Symptom</td><td align="left" valign="top">8236</td><td align="left" valign="top">275,457</td></tr><tr><td align="left" valign="top">Exam</td><td align="left" valign="top">11,163</td><td align="left" valign="top">389,045</td></tr><tr><td align="left" valign="top">Disease</td><td align="left" valign="top">1462</td><td align="left" valign="top">39,599</td></tr><tr><td align="left" valign="top">Treatment</td><td align="left" valign="top">3260</td><td align="left" valign="top">59,852</td></tr></tbody></table></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Segmentation Synonym Sentence Synthesis algorithm extended effect on the China Conference on Knowledge Graph and Semantic Computing 2019 test set.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Preexpansion</td><td align="left" valign="bottom">Postexpansion</td></tr></thead><tbody><tr><td align="left" valign="top">Laboratory</td><td align="left" valign="top">1796</td><td align="left" valign="top">20,270</td></tr><tr><td align="left" valign="top">Image</td><td align="left" valign="top">1324</td><td align="left" 
valign="top">17,396</td></tr><tr><td align="left" valign="top">Operation</td><td align="left" valign="top">1194</td><td align="left" valign="top">18,662</td></tr><tr><td align="left" valign="top">Disease</td><td align="left" valign="top">5540</td><td align="left" valign="top">77,207</td></tr><tr><td align="left" valign="top">Drug</td><td align="left" valign="top">2316</td><td align="left" valign="top">24,365</td></tr><tr><td align="left" valign="top">Anatomy</td><td align="left" valign="top">11,521</td><td align="left" valign="top">143,332</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Experiment Results</title><p>To demonstrate the effectiveness of the algorithm, we constructed four models: (1) SSSS + BERT + CRF, (2) SSSS + BERT + BiLSTM + CRF, (3) SSSS + RoBERTa + CRF, and (4) SSSS + RoBERTa + BiLSTM + CRF. These were compared with BERT + CRF (baseline 1) and BERT + BiLSTM + CRF (baseline 2). To investigate the impact of SSSS on RoBERTa, we also performed an ablation study on the RoBERTa + CRF and RoBERTa + BiLSTM + CRF models. The results for CCKS-2017 and CCKS-2019 are presented in <xref ref-type="table" rid="table7">Tables 7</xref> and <xref ref-type="table" rid="table8">8</xref>. Specifically, incorporating SSSS into the BERT + CRF and BERT + BiLSTM + CRF models resulted in <italic>F</italic><sub>1</sub> measure increases of 1.97% (compared with baseline 1) and 1.77% (compared with baseline 2), respectively, for CCKS-2017. Switching from BERT to RoBERTa, which includes more Chinese data in its pretraining, led to even more significant improvements. The <italic>F</italic><sub>1</sub>-score of SSSS + RoBERTa + CRF improved by 2.51% (compared with baseline 1) and 2.36% (compared with RoBERTa + CRF), and SSSS + RoBERTa + BiLSTM + CRF improved by 2.37% (compared with baseline 2) and by 1.66% (compared with RoBERTa + BiLSTM + CRF). 
For CCKS-2019, similar enhancements were observed, with increases of 2.06% (compared with baseline 1) and 2.29% (compared with baseline 2) for SSSS + BERT + CRF and SSSS + BERT + BiLSTM + CRF; 2.62% (compared with baseline 1) and 2.24% (compared with RoBERTa + CRF) for SSSS + RoBERTa + CRF; and 2.44% (compared with baseline 2) and 2.12% (compared with RoBERTa + BiLSTM + CRF) for SSSS + RoBERTa + BiLSTM + CRF.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Results of various methods on the China Conference on Knowledge Graph and Semantic Computing 2017 test set.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Precision, %</td><td align="left" valign="bottom">Recall, %</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, %</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup> (baseline1)</td><td align="left" valign="top">87.61</td><td align="left" valign="top">90.00</td><td align="left" valign="top">88.79</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">89.27</td><td align="left" valign="top">88.69</td><td align="left" valign="top">88.98</td></tr><tr><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup> + CRF</td><td align="left" valign="top">87.52</td><td align="left" valign="top">90.40</td><td align="left" valign="top">88.94</td></tr><tr><td align="left" valign="top">RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">89.96</td><td align="left" valign="top">89.43</td><td align="left" valign="top">89.69</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table7fn5">e</xref></sup> + 
BERT + CRF</td><td align="left" valign="top">91.20</td><td align="left" valign="top">90.33</td><td align="left" valign="top">90.76</td></tr><tr><td align="left" valign="top">SSSS + BERT + BiLSTM + CRF</td><td align="left" valign="top">90.70</td><td align="left" valign="top">90.80</td><td align="left" valign="top">90.75</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + CRF</td><td align="left" valign="top">91.31</td><td align="left" valign="top">91.29</td><td align="left" valign="top">91.30</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">91.22</td><td align="left" valign="top">91.48</td><td align="left" valign="top">91.35</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table7fn2"><p><sup>b</sup>CRF: conditional random field.</p></fn><fn id="table7fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table7fn4"><p><sup>d</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn><fn id="table7fn5"><p><sup>e</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Results of various methods on the China Conference on Knowledge Graph and Semantic Computing 2019 test set.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Precision, %</td><td align="left" valign="bottom">Recall, %</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, %</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup> (baseline 1)</td><td align="left" valign="top">78.43</td><td 
align="left" valign="top">82.88</td><td align="left" valign="top">80.59</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">78.14</td><td align="left" valign="top">83.17</td><td align="left" valign="top">80.57</td></tr><tr><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> + CRF</td><td align="left" valign="top">78.10</td><td align="left" valign="top">84.06</td><td align="left" valign="top">80.97</td></tr><tr><td align="left" valign="top">RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">79.82</td><td align="left" valign="top">82.00</td><td align="left" valign="top">80.89</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup> + BERT + CRF</td><td align="left" valign="top">81.08</td><td align="left" valign="top">84.28</td><td align="left" valign="top">82.65</td></tr><tr><td align="left" valign="top">SSSS + BERT + BiLSTM + CRF</td><td align="left" valign="top">81.22</td><td align="left" valign="top">84.57</td><td align="left" valign="top">82.86</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + CRF</td><td align="left" valign="top">81.10</td><td align="left" valign="top">85.46</td><td align="left" valign="top">83.21</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">81.51</td><td align="left" valign="top">84.57</td><td align="left" valign="top">83.01</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table8fn2"><p><sup>b</sup>CRF: conditional random field.</p></fn><fn id="table8fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table8fn4"><p><sup>d</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining 
Approach.</p></fn><fn id="table8fn5"><p><sup>e</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn></table-wrap-foot></table-wrap><p>Further analysis across different entity types in both datasets confirmed the comprehensive performance of our models. The experiment results are shown in <xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref> and <xref ref-type="table" rid="table9">Tables 9</xref> and <xref ref-type="table" rid="table10">10</xref>. In CCKS-2017, all entity types showed improvements in <italic>F</italic><sub>1</sub>-scores after applying the SSSS algorithm. Notably, the body entity type reached an <italic>F</italic><sub>1</sub> score of 88.24% with SSSS + RoBERTa + CRF, marking a 3.45% increase (compared with baseline 1) and 3.71% increase (compared with RoBERTa + CRF). The symptom entity type achieved its highest <italic>F</italic><sub>1</sub>-score at 97.28% with SSSS + RoBERTa + BiLSTM + CRF, improving by 0.92% (compared with baseline 2) and 0.81% (compared with RoBERTa + BiLSTM + CRF). SSSS + RoBERTa + BiLSTM + CRF also led in the exam entity type with an <italic>F</italic><sub>1</sub>-score of 90.51%, representing a 1.5% increase compared with baseline 2 and a 1.02% increase compared with RoBERTa + BiLSTM + CRF. The disease entity type saw their highest <italic>F</italic><sub>1</sub>-score of 88.88% with SSSS + RoBERTa + CRF, increasing by 4.22% (compared with baseline 1) and 2.56% (compared with RoBERTa + CRF). The treatment entity achieved the highest <italic>F</italic><sub>1</sub>-score of 88.38% using SSSS + RoBERTa + CRF, marking an increase of 1.41% (compared with baseline 1) and 2.23% (compared with RoBERTa + CRF). The CCKS-2019 results echoed this pattern of improvement across all entity types. 
The laboratory, image, operation, disease, drug, and anatomy entity types all saw their best performances with our models, showcasing the effectiveness of the SSSS algorithm in enhancing model accuracy and robustness.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Results of different models on various entity types within the CCKS-2017 test set. BERT: Bidirectional Encoder Representations from Transformers; BiLSTM: Bidirectional Long Short-Term Memory; CCKS: China Conference on Knowledge Graph and Semantic Computing; CRF: conditional random fields; RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach; SSSS: Segmentation Synonym Sentence Synthesis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Results of different models on various entity types within the CCKS-2019 test set. 
BERT: Bidirectional Encoder Representations from Transformers; BiLSTM: Bidirectional Long Short-Term Memory; CCKS: China Conference on Knowledge Graph and Semantic Computing; CRF: conditional random fields; RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach; SSSS: Segmentation Synonym Sentence Synthesis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e60334_fig06.png"/></fig><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Results of entity type on the China Conference on Knowledge Graph and Semantic Computing 2017 test set.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Body</td><td align="left" valign="bottom">Symptom</td><td align="left" valign="bottom">Exam</td><td align="left" valign="bottom">Disease</td><td align="left" valign="bottom">Treatment</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup> (baseline 1)</td><td align="left" valign="top">84.79</td><td align="left" valign="top">96.39</td><td align="left" valign="top">86.44</td><td align="left" valign="top">84.66</td><td align="left" valign="top">86.97</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table9fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">83.68</td><td align="left" valign="top">96.36</td><td align="left" valign="top">89.01</td><td align="left" valign="top">84.56</td><td align="left" valign="top">86.14</td></tr><tr><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table9fn4">d</xref></sup> + CRF</td><td align="left" valign="top">84.53</td><td align="left" valign="top">96.02</td><td align="left" 
valign="top">86.73</td><td align="left" valign="top">86.32</td><td align="left" valign="top">86.15</td></tr><tr><td align="left" valign="top">RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">85.34</td><td align="left" valign="top">96.47</td><td align="left" valign="top">89.49</td><td align="left" valign="top">86.68</td><td align="left" valign="top">85.82</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table9fn5">e</xref></sup> + BERT + CRF</td><td align="left" valign="top">87.01</td><td align="left" valign="top">96.91</td><td align="left" valign="top">89.83</td><td align="left" valign="top">88.25</td><td align="left" valign="top">85.96</td></tr><tr><td align="left" valign="top">SSSS + BERT + BiLSTM + CRF</td><td align="left" valign="top">86.91</td><td align="left" valign="top">97.21</td><td align="left" valign="top">89.42</td><td align="left" valign="top">87.45</td><td align="left" valign="top">88.10</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + CRF</td><td align="left" valign="top">88.24</td><td align="left" valign="top">97.24</td><td align="left" valign="top">89.06</td><td align="left" valign="top">88.88</td><td align="left" valign="top">88.38</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">87.65</td><td align="left" valign="top">97.28</td><td align="left" valign="top">90.51</td><td align="left" valign="top">88.61</td><td align="left" valign="top">87.55</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table9fn2"><p><sup>b</sup>CRF: conditional random field.</p></fn><fn id="table9fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table9fn4"><p><sup>d</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn><fn id="table9fn5"><p><sup>e</sup>SSSS: 
Segmentation Synonym Sentence Synthesis.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Results of entity type on the China Conference on Knowledge Graph and Semantic Computing 2019 test set.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">Laboratory</td><td align="left" valign="top">Image</td><td align="left" valign="top">Operation</td><td align="left" valign="top">Disease</td><td align="left" valign="top">Drug</td><td align="left" valign="top">Anatomy</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table10fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table10fn2">b</xref></sup> (baseline 1)</td><td align="left" valign="top">75.85</td><td align="left" valign="top">82.25</td><td align="left" valign="top">84.16</td><td align="left" valign="top">79.39</td><td align="left" valign="top">83.44</td><td align="left" valign="top">81.30</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table10fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">77.54</td><td align="left" valign="top">82.39</td><td align="left" valign="top">76.71</td><td align="left" valign="top">79.97</td><td align="left" valign="top">82.25</td><td align="left" valign="top">81.36</td></tr><tr><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table10fn4">d</xref></sup> + CRF</td><td align="left" valign="top">78.70</td><td align="left" valign="top">79.43</td><td align="left" valign="top">79.13</td><td align="left" valign="top">81.02</td><td align="left" valign="top">83.61</td><td align="left" valign="top">81.33</td></tr><tr><td align="left" valign="top">RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">77.65</td><td align="left" valign="top">83.05</td><td align="left" valign="top">80.37</td><td 
align="left" valign="top">80.38</td><td align="left" valign="top">82.66</td><td align="left" valign="top">81.31</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table10fn5">e</xref></sup> + BERT + CRF</td><td align="left" valign="top">78.55</td><td align="left" valign="top">84.64</td><td align="left" valign="top">84.79</td><td align="left" valign="top">83.53</td><td align="left" valign="top">86.88</td><td align="left" valign="top">81.77</td></tr><tr><td align="left" valign="top">SSSS + BERT + BiLSTM + CRF</td><td align="left" valign="top">79.78</td><td align="left" valign="top">85.63</td><td align="left" valign="top">82.95</td><td align="left" valign="top">82.40</td><td align="left" valign="top">84.98</td><td align="left" valign="top">83.05</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + CRF</td><td align="left" valign="top">82.05</td><td align="left" valign="top">85.96</td><td align="left" valign="top">84.79</td><td align="left" valign="top">82.79</td><td align="left" valign="top">85.71</td><td align="left" valign="top">82.74</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">84.87</td><td align="left" valign="top">83.85</td><td align="left" valign="top">82.57</td><td align="left" valign="top">81.60</td><td align="left" valign="top">85.03</td><td align="left" valign="top">82.95</td></tr></tbody></table><table-wrap-foot><fn id="table10fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table10fn2"><p><sup>b</sup>CRF: conditional random field.</p></fn><fn id="table10fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table10fn4"><p><sup>d</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn><fn id="table10fn5"><p><sup>e</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn></table-wrap-foot></table-wrap><p>To validate 
the performance of our model in handling unknown and low-frequency entities, we conducted experiments comparing our models (SSSS + RoBERTa + CRF and SSSS + RoBERTa + BiLSTM + CRF) with BERT + CRF and BERT + BiLSTM + CRF in terms of precision. Entities were categorized based on their occurrence frequency in the training set, as follows:</p><list list-type="order"><list-item><p>Unknown entities: occurrence frequency of 0 in the training set.</p></list-item><list-item><p>Low-frequency entities: occurrence frequency &#x003C;5 times in the training set.</p></list-item><list-item><p>High-frequency entities: occurrence frequency &#x2265;5 times in the training set.</p></list-item></list><p>The comparison results are shown in <xref ref-type="table" rid="table11">Tables 11</xref> and <xref ref-type="table" rid="table12">12</xref>. From the tables, it can be observed that in the CCKS-2017 task, compared to the baseline models, our models SSSS + RoBERTa + CRF and SSSS + RoBERTa + BiLSTM + CRF improved <italic>F</italic><sub>1</sub>-scores for unknown entities by 6.04% (compared with baseline 1) and 5.54% (compared with baseline 2), respectively. For low-frequency entities, the improvements were 7.74% (compared with baseline 1) and 6.39% (compared with baseline 2), respectively. As for high-frequency entities, improvements of 1.96% (compared with baseline 1) and 1.85% (compared with baseline 2) were achieved, respectively. Similar results were obtained in the CCKS-2019 task. Compared with the baseline models, SSSS + RoBERTa + CRF and SSSS + RoBERTa + BiLSTM + CRF achieved improvements of 4.21% (compared with baseline 1) and 2.29% (compared with baseline 2) for unknown entities, respectively, for . For low-frequency entities, improvements of 2.35% (compared with baseline 1) and 6.31% (compared with baseline 2) were achieved, while for high-frequency entities, improvements of 1.09% (compared with baseline 1) and 0.95% (compared with baseline 2) were observed. 
These results demonstrate significant enhancements in handling unknown and low-frequency entities after expanding the training dataset, with more noticeable improvements observed for low-frequency entities compared to unknown entities.</p><table-wrap id="t11" position="float"><label>Table 11.</label><caption><p>The <italic>F</italic><sub>1</sub>-scores for each method on the China Conference on Knowledge Graph and Semantic Computing 2017 test set.</p></caption><table id="table11" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Unknown entities</td><td align="left" valign="bottom">Low-frequency entities</td><td align="left" valign="bottom">High-frequency entities</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table11fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table11fn2">b</xref></sup> (baseline 1)</td><td align="left" valign="top">40.95</td><td align="left" valign="top">53.43</td><td align="left" valign="top">91.96</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table11fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">42.59</td><td align="left" valign="top">55.98</td><td align="left" valign="top">92.09</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table11fn4">d</xref></sup> + RoBERTa<sup><xref ref-type="table-fn" rid="table11fn5">e</xref></sup> + CRF</td><td align="left" valign="top">46.99</td><td align="left" valign="top">61.17</td><td align="left" valign="top">93.92</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">48.13</td><td align="left" valign="top">62.37</td><td align="left" valign="top">93.94</td></tr></tbody></table><table-wrap-foot><fn id="table11fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn 
id="table11fn2"><p><sup>b</sup>CRF: conditional random field.</p></fn><fn id="table11fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table11fn4"><p><sup>d</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn><fn id="table11fn5"><p><sup>e</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t12" position="float"><label>Table 12.</label><caption><p>The <italic>F</italic><sub>1</sub>-scores for each method on the China Conference on Knowledge Graph and Semantic Computing 2019 test set.</p></caption><table id="table12" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Unknown entities</td><td align="left" valign="bottom">Low-frequency entities</td><td align="left" valign="bottom">High-frequency entities</td></tr></thead><tbody><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table12fn1">a</xref></sup> + CRF<sup><xref ref-type="table-fn" rid="table12fn2">b</xref></sup> (baseline 1)</td><td align="left" valign="top">47.84</td><td align="left" valign="top">63.90</td><td align="left" valign="top">83.65</td></tr><tr><td align="left" valign="top">BERT + BiLSTM<sup><xref ref-type="table-fn" rid="table12fn3">c</xref></sup> + CRF (baseline 2)</td><td align="left" valign="top">45.58</td><td align="left" valign="top">63.59</td><td align="left" valign="top">84.01</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table12fn4">d</xref></sup> + RoBERTa<sup><xref ref-type="table-fn" rid="table12fn5">e</xref></sup> + CRF</td><td align="left" valign="top">52.05</td><td align="left" valign="top">66.25</td><td align="left" valign="top">84.74</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">47.87</td><td align="left" valign="top">68.68</td><td align="left" 
valign="top">84.96</td></tr></tbody></table><table-wrap-foot><fn id="table12fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table12fn2"><p><sup>b</sup>CRF: conditional random field..</p></fn><fn id="table12fn3"><p><sup>c</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table12fn4"><p><sup>d</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn><fn id="table12fn5"><p><sup>e</sup>RoBERTa:Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn></table-wrap-foot></table-wrap><p>To demonstrate the superiority of our model, we compared it with existing state-of-the-art models. <xref ref-type="table" rid="table13">Table 13</xref> presents the experimental results of different models on the CCKS-2017 and CCKS-2019 datasets. Our model shows a clear advantage.</p><table-wrap id="t13" position="float"><label>Table 13.</label><caption><p>Comparison of results with existing models on the China Conference on Knowledge Graph and Semantic Computing 2017 and 2019 datasets.</p></caption><table id="table13" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="3">2017 dataset</td><td align="left" valign="bottom" colspan="3">2019 dataset</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Precision, %</td><td align="left" valign="bottom">Recall, %</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, %</td><td align="left" valign="bottom">Precision, %</td><td align="left" valign="bottom">Recall, %</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, %</td></tr></thead><tbody><tr><td align="left" valign="top">AT<sup><xref ref-type="table-fn" rid="table13fn1">a</xref></sup>-Lattice LSTM<sup><xref ref-type="table-fn" rid="table13fn2">b</xref></sup>-CRF<sup><xref ref-type="table-fn" rid="table13fn3">c</xref></sup> [<xref 
ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">88.98</td><td align="left" valign="top">90.28</td><td align="left" valign="top">89.64</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table13fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">BiLSTM<sup><xref ref-type="table-fn" rid="table13fn5">e</xref></sup>-CRF + Gazetteer + Spatial Attention [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">85.39</td><td align="left" valign="top">87.62</td><td align="left" valign="top">86.49</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">BiLSTM-Att<sup><xref ref-type="table-fn" rid="table13fn6">f</xref></sup>-CRF + POS<sup><xref ref-type="table-fn" rid="table13fn7">g</xref></sup> + Dic<sup><xref ref-type="table-fn" rid="table13fn8">h</xref></sup> [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">90.41</td><td align="left" valign="top">90.49</td><td align="left" valign="top">90.48</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">MCBERT<sup><xref ref-type="table-fn" rid="table13fn9">i</xref></sup>-GCN<sup><xref ref-type="table-fn" rid="table13fn10">j</xref></sup>-CRF [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">83.87</td><td align="left" valign="top">82.26</td><td align="left" valign="top">83.06</td></tr><tr><td align="left" valign="top">SSSS<sup><xref ref-type="table-fn" rid="table13fn11">k</xref></sup> + RoBERTa<sup><xref ref-type="table-fn" 
rid="table13fn12">l</xref></sup> + CRF</td><td align="left" valign="top">91.31</td><td align="left" valign="top">91.29</td><td align="left" valign="top">91.30</td><td align="left" valign="top">81.10</td><td align="left" valign="top">85.46</td><td align="left" valign="top">83.21</td></tr><tr><td align="left" valign="top">SSSS + RoBERTa + BiLSTM + CRF</td><td align="left" valign="top">91.22</td><td align="left" valign="top">91.48</td><td align="left" valign="top">91.35</td><td align="left" valign="top">81.51</td><td align="left" valign="top">84.57</td><td align="left" valign="top">83.01</td></tr></tbody></table><table-wrap-foot><fn id="table13fn1"><p><sup>a</sup>AT: adversarial training.</p></fn><fn id="table13fn2"><p><sup>b</sup>LSTM: Long Short-Term Memory.</p></fn><fn id="table13fn3"><p><sup>c</sup>CRF: conditional random field.</p></fn><fn id="table13fn4"><p><sup>d</sup>Not applicable.</p></fn><fn id="table13fn5"><p><sup>e</sup>BiLSTM: Bidirectional Long Short-Term Memory.</p></fn><fn id="table13fn6"><p><sup>f</sup>Att: attention.</p></fn><fn id="table13fn7"><p><sup>g</sup>POS: part-of-speech.</p></fn><fn id="table13fn8"><p><sup>h</sup>Dic: dictionary.</p></fn><fn id="table13fn9"><p><sup>i</sup>MCBERT: Medical Chinese Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table13fn10"><p><sup>j</sup>GCN: graph neural network.</p></fn><fn id="table13fn11"><p><sup>k</sup>SSSS: Segmentation Synonym Sentence Synthesis.</p></fn><fn id="table13fn12"><p><sup>l</sup>RoBERTa: Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>We proposed the SSSS algorithm based on neighboring vocabulary to effectively expand the training dataset without introducing additional specialized domain dictionaries, thereby enhancing the model&#x2019;s performance in CNER 
tasks. The algorithm utilized the Jieba library to tokenize the original entities, then used a natural language vocabulary trained based on Word2Vec and calculated neighboring vocabulary through the Synonyms library to generate more forms of entity expressions, which are integrated into the training set. This approach allowed the model to encounter more diverse forms of entities during training, thereby improving its generalization ability and capability to recognize diverse entities.</p><p>In terms of model structure, this study adopted BERT as the underlying model, combined with the CRF model for sequence labeling tasks, and introduced the BiLSTM model for extracting local features. Experimental results demonstrated that these models achieved significant performance improvement in handling CNER tasks after introducing the SSSS algorithm. The algorithm substantially augmented the dataset, leading to notable enhancements in identifying previously unknown entities and low-frequency entities. Particularly, the improvement in low-frequency entities was substantial, as the generation of expanded entities depends on the decomposition and recombination of existing entities. By splitting and expanding low-frequency entities, their frequencies can be increased, effectively enhancing the model&#x2019;s recognition capabilities for these entities. 
For example, in the EMR text &#x201C;&#x4F9D;&#x636E;&#x5934;&#x9885; CT&#xFF1A;&#x591A;&#x53D1;&#x8111;&#x6897;&#x6B7B;&#xFF0C;&#x6545;&#x591A;&#x53D1;&#x8111;&#x6897;&#x6B7B;&#x8BCA;&#x65AD;&#x660E;&#x786E; (Based on cranial CT: multiple cerebral infarctions, hence the diagnosis of multiple cerebral infarctions is clear),&#x201D; the disease entity &#x201C;&#x591A;&#x53D1;&#x8111;&#x6897;&#x6B7B; (multiple cerebral infarctions)&#x201D; and the treatment entity &#x201C;&#x5355;&#x785D;&#x9178;&#x5F02;&#x5C71;&#x68A8;&#x916F;&#x6269;&#x51A0; (isosorbide mononitrate vasodilation)&#x201D; in the phrase &#x201C;&#x5355;&#x785D;&#x9178;&#x5F02;&#x5C71;&#x68A8;&#x916F;&#x6269;&#x51A0;&#x6539;&#x5584;&#x5FC3;&#x808C;&#x7F3A;&#x8840; (isosorbide mononitrate vasodilation to improve myocardial ischemia)&#x201D; appeared only once in the original dataset and they were not recognized by the baseline model. However, after SSSS expansion, these entities were successfully identified. For high-frequency entities, such as the cure entities &#x201C;&#x963F;&#x53F8;&#x5339;&#x6797; (Aspirin)&#x201D; and &#x201C;&#x5934;&#x5B62;&#x54CC;&#x916E;&#x94A0;&#x8212;&#x5DF4;&#x5766;&#x94A0; (Cefoperazone Sodium and Sulbactam Sodium)&#x201D; and the disease entity &#x201C;&#x51A0;&#x5FC3;&#x75C5; (coronary heart disease),&#x201D; expansion further increased their occurrence frequency in the training set, improving coverage. However, for previously unknown entities, although some new entities could be generated through the decomposition and expansion of high-frequency and low-frequency entities, their improvement was less than that of low-frequency entities. 
For example, the body entity &#x201C;&#x53F3;&#x4FA7;&#x80F8;&#x8154; (right pleural cavity)&#x201D; did not exist in the original dataset but was successfully identified through expansion from entities like &#x201C;&#x80F8;&#x8154; (pleural cavity)&#x201D; and &#x201C;&#x5DE6;&#x4FA7;&#x80F8;&#x8154; (left pleural cavity).&#x201D; However, drug entities such as &#x201C;&#x5730;&#x9AD8;&#x8F9B; (digoxin)&#x201D; and &#x201C;&#x683C;&#x5217;&#x672C;&#x8132; (glibenclamide),&#x201D; which were also absent in the original dataset, remained unrecognized even after expansion. This is because it is difficult to create entities that are entirely absent from the original training set but that exist in the medical domain; these entities are far from any entity in the original training set based on the edit distance algorithm. Subsequently, replacing BERT with RoBERTa further improved performance, attributed to RoBERTa&#x2019;s increased use of pretraining data, leading to increased data volume and iteration rounds, thus validating the effectiveness and superiority of the proposed model.</p><p>This study adopted a multibaseline and multidataset cross-experimental method, achieving significant improvements in 2 model structures (BERT + CRF and BERT + BiLSTM + CRF) and 2 datasets (CCKS-2017 and CCKS-2019), demonstrating that the method of expanding the dataset by replacing neighboring vocabulary expressions with new words can effectively improve the accuracy and recall of the model on vocabulary in different models.</p></sec><sec id="s4-2"><title>Limitations and Future Work</title><p>The increase in training time due to the expansion of vocabulary expressions varies. Moreover, it can be observed that in the CCKS-2019 task, the use of the expanded dataset for anatomical entities was improved but still did not reach the average level. This may be because anatomical entities often appear mixed in surgical or disease and diagnosis entities. 
Additionally, since the algorithm did not introduce additional domain dictionaries, there are still shortcomings in the expansion method for discovering new unknown entities. Due to the extensive expansion of domain-specific vocabulary, it may be difficult to ensure that the restructured sentences fully retain the original meaning. With the rapid development of medical information, EMR text data are becoming increasingly extensive and complex, resulting in higher requirements for the performance and efficiency of models. In future research, further combining small-scale domain dictionaries to enhance the coverage of unknown entities&#x2014;or using techniques such as random word replacement with MacBERT or Chinese word embeddings with BERT-wwm&#x2014;while addressing issues like nested anatomical entities and Chinese word segmentation ambiguities remains a direction that requires continued exploration and investigation.</p></sec><sec id="s4-3"><title>Conclusion</title><p>This study introduces an adaptive dataset optimization algorithm named SSSS, which is based on the utilization of nearby vocabulary expressions. The algorithm was extensively validated using the CCKS-2017 and CCKS-2019 datasets. We leveraged existing public knowledge, eliminating the need for manual expansion of specialized domain dictionaries. By segmenting the existing vocabulary and replacing it with new synonyms from the large natural language database word2vec, we achieved the recombination of the datasets&#x2019; nearby expanded expressions. 
Experimental results demonstrated that our algorithm successfully expanded the documents of CCKS-2017 and CCKS-2019 by approximately 17 times and 20 times, effectively addressing challenges such as data acquisition, annotation difficulties, and insufficient model generalization performance.</p><p>In terms of performance evaluation, when compared to the basic BERT + CRF and BERT + BiLSTM + CRF models, our model improved <italic>F</italic><sub>1</sub>-scores by 2.51% and 2.37% in the CCKS-2017 task, and achieved an increase of 2.62% and 2.44% in <italic>F</italic><sub>1</sub>-scores in the CCKS-2019 task. Furthermore, through the expansion of nearby vocabulary, our model outperformed BERT + CRF and BERT + BiLSTM + CRF in handling unknown entities and low-frequency entities. This provides a novel approach for addressing challenges in CNER tasks, such as the unstructured nature of clinical text, poor contextual association, and difficulties in annotation.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">BiLSTM</term><def><p>Bidirectional Long Short-Term Memory</p></def></def-item><def-item><term id="abb3">CCKS</term><def><p>China Conference on Knowledge Graph and Semantic Computing</p></def></def-item><def-item><term id="abb4">CNER</term><def><p>clinical named entity recognition</p></def></def-item><def-item><term id="abb5">CRF</term><def><p>conditional random field</p></def></def-item><def-item><term id="abb6">Dic-Att-BiLSTM-CRF</term><def><p>dictionary-attention-Bidirectional Long Short-Term Memory-conditional random field</p></def></def-item><def-item><term id="abb7">EMR</term><def><p>electronic medical record</p></def></def-item><def-item><term id="abb8">LSTM</term><def><p>Long Short-Term 
Memory</p></def></def-item><def-item><term id="abb9">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb10">RoBERTa</term><def><p>Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach</p></def></def-item><def-item><term id="abb11">SSSS</term><def><p>Segmentation Synonym Sentence Synthesis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Rong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ouyang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>Z</given-names> </name></person-group><article-title>External features enriched model for biomedical question answering</article-title><source>BMC Bioinformatics</source><year>2021</year><month>05</month><day>26</day><volume>22</volume><issue>1</issue><fpage>272</fpage><pub-id pub-id-type="doi">10.1186/s12859-021-04176-7</pub-id><pub-id pub-id-type="medline">34039273</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>K</given-names> </name></person-group><article-title>Entity recognition of Chinese medical text based on multi-head self-attention combined with BILSTM-CRF</article-title><source>Math Biosci Eng</source><year>2022</year><month>01</month><day>4</day><volume>19</volume><issue>3</issue><fpage>2206</fpage><lpage>2218</lpage><pub-id pub-id-type="doi">10.3934/mbe.2022103</pub-id><pub-id 
pub-id-type="medline">35240782</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name><name name-style="western"><surname>McDonald</surname><given-names>CJ</given-names> </name></person-group><article-title>What can natural language processing do for clinical decision support?</article-title><source>J Biomed Inform</source><year>2009</year><month>10</month><volume>42</volume><issue>5</issue><fpage>760</fpage><lpage>772</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2009.08.007</pub-id><pub-id pub-id-type="medline">19683066</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Uzuner</surname><given-names>&#x00D6;</given-names> </name><name name-style="western"><surname>South</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>S</given-names> </name><name name-style="western"><surname>DuVall</surname><given-names>SL</given-names> </name></person-group><article-title>2010 i2b2/VA challenge on concepts, assertions, and relations in clinical text</article-title><source>J Am Med Inform Assoc</source><year>2011</year><volume>18</volume><issue>5</issue><fpage>552</fpage><lpage>556</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000203</pub-id><pub-id pub-id-type="medline">21685143</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duan</surname><given-names>H</given-names> </name></person-group><article-title>A study on features of the CRFs&#x2010;based Chinese named entity 
recognition</article-title><source>Int J Adv Intell Paradigms</source><year>2011</year><access-date>2024-11-16</access-date><volume>3</volume><issue>2</issue><fpage>287</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.semanticscholar.org/paper/A-Study-on-Features-of-the-CRFs-based-Chinese-Named-Duan-Zheng/a874006d45beb668603e382a7fcf29f6cfe6baec">https://www.semanticscholar.org/paper/A-Study-on-Features-of-the-CRFs-based-Chinese-Named-Duan-Zheng/a874006d45beb668603e382a7fcf29f6cfe6baec</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaitarova</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zaghir</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lavelli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Krauthammer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rinaldi</surname><given-names>F</given-names> </name></person-group><article-title>Exploring the latest highlights in medical natural language processing across multiple languages: a survey</article-title><source>Yearb Med Inform</source><year>2023</year><month>08</month><volume>32</volume><issue>1</issue><fpage>230</fpage><lpage>243</lpage><pub-id pub-id-type="doi">10.1055/s-0043-1768726</pub-id><pub-id pub-id-type="medline">38147865</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>N&#x00E9;v&#x00E9;ol</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Velupillai</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zweigenbaum</surname><given-names>P</given-names> </name></person-group><article-title>Clinical natural language processing in languages other than English: opportunities and challenges</article-title><source>J Biomed Semantics</source><year>2018</year><month>03</month><day>30</day><volume>9</volume><issue>1</issue><fpage>12</fpage><pub-id pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id><pub-id pub-id-type="medline">29602312</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraile Navarro</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ijaz</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rezazadegan</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Clinical named entity recognition and relation extraction using natural language processing of medical free text: a systematic review</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105122</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105122</pub-id><pub-id pub-id-type="medline">37295138</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Firth</surname><given-names>JR</given-names> </name></person-group><source>A Synopsis of Linguistic Theory, 1930-1955</source><year>1957</year><publisher-name>Blackwell</publisher-name></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Chinese clinical named entity recognition in electronic medical records: development of a lattice long short-term memory model with contextualized character representations</article-title><source>JMIR Med Inform</source><year>2020</year><month>09</month><day>4</day><volume>8</volume><issue>9</issue><fpage>e19848</fpage><pub-id pub-id-type="doi">10.2196/19848</pub-id><pub-id pub-id-type="medline">32885786</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name></person-group><article-title>Document-level attention-based BiLSTM-CRF incorporating disease dictionary for disease named entity recognition</article-title><source>Comput Biol Med</source><year>2019</year><month>05</month><volume>108</volume><fpage>122</fpage><lpage>132</lpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2019.04.002</pub-id><pub-id pub-id-type="medline">31003175</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ruan</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Gao</surname><given-names>D</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><name name-style="western"><surname>He</surname><given-names>P</given-names> </name></person-group><article-title>Incorporating dictionaries into deep neural networks for the Chinese clinical named entity recognition</article-title><source>J Biomed Inform</source><year>2019</year><month>04</month><volume>92</volume><fpage>103133</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103133</pub-id><pub-id pub-id-type="medline">30818005</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cook</surname><given-names>HV</given-names> </name><name name-style="western"><surname>Jensen</surname><given-names>LJ</given-names> </name></person-group><article-title>A guide to dictionary-based text mining</article-title><source>Methods Mol Biol</source><year>2019</year><volume>1939</volume><fpage>73</fpage><lpage>89</lpage><pub-id pub-id-type="doi">10.1007/978-1-4939-9089-4_5</pub-id><pub-id pub-id-type="medline">30848457</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dash</surname><given-names>S</given-names> </name><name name-style="western"><surname>Acharya</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Mittal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abraham</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kelemen</surname><given-names>A</given-names> </name></person-group><source>Deep Learning Techniques for Biomedical and Health Informatics</source><year>2020</year><publisher-name>Springer</publisher-name><pub-id 
pub-id-type="other">3030339661</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Soriano</surname><given-names>IM</given-names> </name><name name-style="western"><surname>Pe&#x00F1;a</surname><given-names>JLC</given-names> </name></person-group><article-title>STMC: semantic tag medical concept using word2vec representation</article-title><conf-name>2018 IEEE 31st International Symposium on Computer-Based Medical Systems</conf-name><conf-date>Jun 18-21, 2018</conf-date><conf-loc>Karlstad, Sweden</conf-loc><pub-id pub-id-type="doi">10.1109/CBMS.2018.00075</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Usino</surname><given-names>W</given-names> </name><name name-style="western"><surname>Satria</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hamed</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bramantoro</surname><given-names>A</given-names> </name><name name-style="western"><surname>A</surname><given-names>H</given-names> </name><name name-style="western"><surname>Amaldi</surname><given-names>W</given-names> </name></person-group><article-title>Document similarity detection using k-means and cosine distance</article-title><source>IJACSA</source><year>2019</year><volume>10</volume><issue>2</issue><pub-id pub-id-type="doi">10.14569/IJACSA.2019.0100222</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> 
</name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>ArXiv</source><access-date>2024-11-01</access-date><comment>Preprint posted online on Oct 11, 2018</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1810.04805">https://arxiv.org/abs/1810.04805</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>ArXiv</source><access-date>2024-11-01</access-date><comment>Preprint posted online on Jul 26, 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1907.11692">https://arxiv.org/abs/1907.11692</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>X</given-names> </name><etal/></person-group><article-title>KeMRE: knowledge-enhanced medical relation extraction for Chinese medicine 
instructions</article-title><source>J Biomed Inform</source><year>2021</year><month>08</month><volume>120</volume><fpage>103834</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2021.103834</pub-id><pub-id pub-id-type="medline">34119692</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Chinese-named entity recognition from adverse drug event records: radical embedding-combined dynamic embedding-based BERT in a bidirectional long short-term conditional random field (Bi-LSTM-CRF) model</article-title><source>JMIR Med Inform</source><year>2021</year><month>12</month><day>1</day><volume>9</volume><issue>12</issue><fpage>e26407</fpage><pub-id pub-id-type="doi">10.2196/26407</pub-id><pub-id pub-id-type="medline">34855616</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Use of BERT (bidirectional encoder representations from transformers)-based deep learning method for extracting evidences in Chinese radiology reports: development of a computer-aided liver cancer diagnosis framework</article-title><source>J Med Internet Res</source><year>2021</year><month>01</month><day>12</day><volume>23</volume><issue>1</issue><fpage>e19689</fpage><pub-id pub-id-type="doi">10.2196/19689</pub-id><pub-id 
pub-id-type="medline">33433395</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Extracting comprehensive clinical information for breast cancer using deep learning methods</article-title><source>Int J Med Inform</source><year>2019</year><month>12</month><volume>132</volume><fpage>103985</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.103985</pub-id><pub-id pub-id-type="medline">31627032</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>B</given-names> </name></person-group><article-title>Research on named entity recognition of Traditional Chinese Medicine chest discomfort cases incorporating domain vocabulary features</article-title><source>Comput Biol Med</source><year>2023</year><month>11</month><volume>166</volume><fpage>107466</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107466</pub-id><pub-id pub-id-type="medline">37742417</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Huangzikun/ccks-ssss</article-title><source>GitHub</source><access-date>2024-11-01</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://github.com/Huangzikun/ccks-ssss">https://github.com/Huangzikun/ccks-ssss</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>A</given-names> </name></person-group><article-title>Adversarial training based lattice LSTM for Chinese clinical named entity recognition</article-title><source>J Biomed Inform</source><year>2019</year><month>11</month><volume>99</volume><fpage>103290</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103290</pub-id><pub-id pub-id-type="medline">31557528</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Du</surname><given-names>G</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Towards Chinese clinical named entity recognition by dynamic embedding using domain-specific knowledge</article-title><source>J Biomed Inform</source><year>2020</year><month>06</month><volume>106</volume><fpage>103435</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103435</pub-id><pub-id pub-id-type="medline">32360988</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>F</given-names> </name></person-group><article-title>An attention-based deep learning model for clinical named entity recognition of Chinese electronic medical records</article-title><source>BMC Med Inform Decis Mak</source><year>2019</year><month>12</month><day>5</day><volume>19</volume><issue>Suppl 5</issue><fpage>235</fpage><pub-id pub-id-type="doi">10.1186/s12911-019-0933-6</pub-id><pub-id pub-id-type="medline">31801540</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ying</surname><given-names>J</given-names> </name></person-group><article-title>A weakly supervised method for named entity recognition of Chinese electronic medical records</article-title><source>Med Biol Eng Comput</source><year>2023</year><month>10</month><volume>61</volume><issue>10</issue><fpage>2733</fpage><lpage>2743</lpage><pub-id pub-id-type="doi">10.1007/s11517-023-02871-6</pub-id><pub-id pub-id-type="medline">37453978</pub-id></nlm-citation></ref></ref-list></back></article>