<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e57727</article-id><article-id pub-id-type="doi">10.2196/57727</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Semiology Extraction and Machine Learning&#x2013;Based Classification of Electronic Health Records for Patients With Epilepsy: Retrospective Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Xia</surname><given-names>Yilin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>He</surname><given-names>Mengqiao</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Basang</surname><given-names>Sijia</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Sha</surname><given-names>Leihao</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>Zijie</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jin</surname><given-names>Ling</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Duan</surname><given-names>Yifei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tang</surname><given-names>Yusha</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Hua</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lai</surname><given-names>Wanlin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Chen</surname><given-names>Lei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Neurology, West China Hospital, Sichuan University</institution>, <addr-line>#37 Guoxue Alley, Wuhou District</addr-line>, <addr-line>Chengdu</addr-line>, <country>China</country></aff><aff id="aff2"><institution>Sichuan Provincial Engineering Research Center of Brain-Machine Interface, and Sichuan Provincial Engineering Research Center of Neuromodulation, West China 
Hospital, Sichuan University</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lv</surname><given-names>Han</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Xie</surname><given-names>Kevin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Dadheech</surname><given-names>Pankaj</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Lei Chen, MD, Department of Neurology, West China Hospital, Sichuan University, #37 Guoxue Alley, Wuhou District, Chengdu, China, 86 18980605819; <email>leilei_25@126.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>17</day><month>10</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e57727</elocation-id><history><date date-type="received"><day>25</day><month>02</month><year>2024</year></date><date date-type="rev-recd"><day>23</day><month>08</month><year>2024</year></date><date date-type="accepted"><day>25</day><month>08</month><year>2024</year></date></history><copyright-statement>&#x00A9; Yilin Xia, Mengqiao He, Sijia Basang, Leihao Sha, Zijie Huang, Ling Jin, Yifei Duan, Yusha Tang, Hua Li, Wanlin Lai, Lei Chen. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 17.10.2024. 
</copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e57727"/><abstract><sec><title>Background</title><p>Obtaining and describing semiology efficiently and classifying seizure types correctly are crucial for the diagnosis and treatment of epilepsy. Nevertheless, there exists an inadequacy in related informatics resources and decision support tools.</p></sec><sec><title>Objective</title><p>We developed a symptom entity extraction tool and an epilepsy semiology ontology (ESO) and used machine learning to achieve an automated binary classification of epilepsy in this study.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using present history data of electronic health records from the Southwest Epilepsy Center in China, we constructed an ESO and a symptom-entity extraction tool to extract seizure duration, seizure symptoms, and seizure frequency from the unstructured text by combining manual annotation with natural language processing techniques. 
In addition, we achieved automatic classification of patients in the study cohort with high accuracy based on the extracted seizure feature data using multiple machine learning methods.</p></sec><sec sec-type="results"><title>Results</title><p>Data included present history from 10,925 cases between 2010 and 2020. Six annotators labeled a total of 2500 texts to obtain 5844 words of semiology and construct an ESO with 702 terms. Based on the ontology, the extraction tool achieved an accuracy rate of 85% in symptom extraction. Furthermore, we trained a stacking ensemble learning model combining XGBoost and random forest with an <italic>F</italic><sub>1</sub>-score of 75.03%. The random forest model had the highest area under the curve (0.985).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This work demonstrated the feasibility of natural language processing&#x2013;assisted structural extraction of epilepsy medical record texts and downstream tasks, providing open ontology resources for subsequent related work.</p></sec></abstract><kwd-group><kwd>epilepsy</kwd><kwd>natural language processing</kwd><kwd>machine learning</kwd><kwd>electronic health record</kwd><kwd>unstructured text</kwd><kwd>semiology</kwd><kwd>health records</kwd><kwd>retrospective analysis</kwd><kwd>diagnosis</kwd><kwd>treatment</kwd><kwd>decision support tools</kwd><kwd>symptom</kwd><kwd>ontology</kwd><kwd>China</kwd><kwd>Chinese</kwd><kwd>seizure</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Epilepsy is a major chronic neurological disorder that affects approximately 70 million people and severely reduces the quality of life of patients and their families [<xref ref-type="bibr" rid="ref1">1</xref>]. Obtaining a correct and complete seizure semiology efficiently is essential for the diagnosis and classification of seizures. However, this process is difficult to achieve. 
First, the symptoms of seizures are stereotypical but variable, and the same seizure course is in fact a complex combination of multiple symptomatologic elements in time and space. Furthermore, the type of seizure an individual patient experiences can change over the course of the disease [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Second, seizures have sudden onset, resulting in a short period of time for patients or witnesses to recognize and observe them, and history taking often relies on experienced and careful questioning by epilepsy specialists rather than recording the patient&#x2019;s statements directly [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Finally, epilepsy specialists are scarce and unevenly distributed worldwide. Nonneurologists, medical students, caregivers, and community workers play important roles in epilepsy care but lack appropriate tools to tease out epilepsy histories and determine classifications [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>In recent years, natural language processing (NLP) has been widely used in the structured processing of clinical text data and development of intelligent diagnostic tools in neurology [<xref ref-type="bibr" rid="ref10">10</xref>]. NLP methods have been used to automatically extract details from electronic health records (EHRs) of patients with epilepsy, such as categorical diagnosis, abnormal electroencephalogram (EEG) and imaging results, and medications prescribed [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. These data are also used to accomplish tasks such as automated identification of cohorts of drug-resistant patients and long-term prognostic tracking [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. 
However, the complexity of epilepsy symptom elements remains a challenge for entity recognition and automatic extraction and classification.</p><p>Therefore, ontologies were introduced to address this complexity. The concept of ontology is derived from philosophy and is used for formal, structured, domain-specific, and human- and computer-interpretable representations of entities and relationships. It has been widely used in computer science, bioinformatics, and medical informatics [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Application ontologies can be used in the medical field to represent established knowledge within a domain and maintain a standardized vocabulary across multiple locations, datasets, and consortiums, allowing for automated computation and decision-making based on structured data. Application ontologies can also be combined with NLP techniques to disambiguate textual concepts and build tools for the knowledge extracted from EHRs [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. This work demonstrated the feasibility of NLP-assisted structural extraction of epilepsy medical record texts and downstream tasks, providing open ontology resources for subsequent related work.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>Electronic medical record data were obtained from patients with an <italic>International Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>) epilepsy diagnosis (G40 or G40.x) who were hospitalized at West China Hospital of Sichuan University and assigned an epilepsy diagnosis between 2010 and 2020. 
The seizure type of inpatients was determined by discharge diagnosis.</p><p>The text information of the current medical history records the details of the occurrence, evolution, diagnosis, and treatment of the patient&#x2019;s disease; is written in chronological order; and is divided into the following parts: onset of the disease, including the time and place of onset; antecedent symptoms; probable causes or triggers; characteristics of the main symptoms and their development and change (describing the location, nature, duration, degree, factors of relief or aggravation, and evolution of the main symptoms in sequential order); accompanying symptoms; diagnosis and treatment since the onset of the disease; and the patient&#x2019;s general condition since the onset of the disease.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The study was reviewed and approved by the Ethics Committee of West China Hospital of Sichuan University (2022(1083)). Since the data were obtained from previous medical records, we have received approval from the ethics committee for a waiver of informed consent. The study data were deidentified, and the privacy and personal information of the subjects were protected.</p></sec><sec id="s2-3"><title>Framework for Standardizing Seizure Information</title><p>We proposed a seizure extraction framework for mining and structuring important information related to seizures from the presenting medical histories of patients with epilepsy (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
The framework requires the extraction of the following information:</p><list list-type="order"><list-item><p>Time stamp: An important point in time, relative to today, at which the patient&#x2019;s condition changed.</p></list-item><list-item><p>Location: Seizure site refers to the anatomical parts of the body corresponding to the symptom performance.</p></list-item><list-item><p>Symptom: Symptom performance refers to the symptoms and signs that appear during the seizure.</p></list-item><list-item><p>Duration of seizure event (episode time): Duration of epileptic events within the seizure episode.</p></list-item><list-item><p>Status: Occurrence state refers to the state corresponding to the symptom performance, including &#x201C;with,&#x201D; &#x201C;without,&#x201D; or &#x201C;unknown.&#x201D;</p></list-item><list-item><p>Frequency: The frequency of seizures (for example, once a month).</p></list-item></list><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Example of the standardized framework.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig01.png"/></fig></sec><sec id="s2-4"><title>Labeling Process</title><p>Six annotators completed the labeling process. Four of them, junior physicians (SB, LS, LJ, and YD) specializing in epilepsy or epilepsy researchers, were responsible for independently extracting seizure-related information from 2500 raw texts of presenting medical histories according to a standardized framework. Two senior physicians (HL and WL) specializing in epilepsy were responsible for discussing and formulating the framework of the annotation and the rules that should be followed during annotation to ensure reliability, providing uniform training to the annotators, and manually reviewing the final results of the annotation. 
Annotation rules included the following:</p><p>When a particular Chinese phrase used to describe the seizure process was a fixed collocation, the phrase was extracted as a whole without separating the verb and the object (usually a location) in it individually, in order to avoid a decrease in the specificity of the extraction.</p><p>Due to the specificity of the commonly used symptomatology phrases in the Chinese section, it is important to ensure that the symptomatic manifestations are extracted at the coarsest possible granularity, that is, descriptive phrases that include seizure state and seizure site are avoided. However, phrases should not be disassembled when they cannot be clearly recognized as symptoms, such as lip smacking (oropharyngeal automatisms) and hand rubbing (hand automatisms), and the anatomical part of the phrase should be retained. It should also be confirmed that all seizure symptomatology is extracted from seizures and not from other symptoms accompanying epilepsy. Cognitive decline, such as in memory and attention, should not be included in labeling.</p><p>Do not standardize the presentation of the extracted information; keep it as close to the original as possible.</p><p>To assess the consistency of the annotations by the 4 annotators, 50 identical medical records were included without their knowledge. Two senior physicians provided reference standards for the annotation of the 50 medical records. We used Fleiss&#x2019;s &#x03BA; to calculate interannotator agreement. By convention, a &#x03BA; value above 0.80 indicates &#x201C;near-perfect&#x201D; agreement.</p></sec><sec id="s2-5"><title>Bilingual Ontology Construction for Seizure Semiology</title><p>Compared with other parts of the seizure information framework, epileptic semiology expression and the diversity of expression extraction tasks are more challenging, especially for Chinese EHRs of epilepsy. 
Therefore, we constructed a bilingual ontology to share the lexicon obtained from manual extraction and annotation. It can be further used, evaluated, and refined for future Chinese epilepsy history extraction tasks.</p><p>We defined the scope of this domain of ontology as epileptic semiology by reference, reused the more authoritative epilepsy-related ontologies and terminology sets as standard terminology, referred to the Basic Formal Ontology (BFO) as the top-level ontology, and hierarchically arranged the entities according to their domain-neutral framework. Then, we deduplicated the annotated symptoms collected in the annotation phase to eliminate redundancy and placed them into the corresponding terms as their synonymous expression properties. We used Prot&#x00E9;g&#x00E9; as the editor of the ontology and uploaded it in Web Ontology Language (OWL) format to the world&#x2019;s largest ontology browser, BioPortal, as the first version.</p></sec><sec id="s2-6"><title>Extraction Process and Evaluation of Extraction Results</title><p>We used some NLP tools to structure the extraction of current medical history from EHRs. We imported the organized dictionaries of symptom performance, symptom nature, seizure frequency, and seizure site into the Jieba tokenizer and initialized the Part-of-Speech Tagger (Postagger) and Dependency Parser (Parser) of the pyltp [<xref ref-type="bibr" rid="ref19">19</xref>] plug-in using existing models (pos.model, parser.model). pyltp provides a series of Chinese NLP tools, and users can use these tools for Chinese text segmentation, part-of-speech tagging, parsing, and so on.</p><p>Specifically, in the data preprocessing stage, we first imported organized dictionaries of symptom presentation, symptom type, seizure frequency, and seizure location. These dictionaries are used for subsequent segmentation and feature extraction. 
We used Postagger to tag the parts of speech of the tokenized results and Parser to analyze the dependency relations of the words in the current sentence or context. Next, we performed text segmentation and annotation, using Jieba Segmenter to segment the medical history text in the EHR. Jieba Segmenter is able to accurately segment the text based on the imported dictionaries. Postagger was called to tag the segmentation results by identifying the part of speech of each word. The dependencies between words were analyzed using Parser to determine the syntactic structure between words. Then, to extract symptom information, we iteratively processed the segmentation results by combining a list of negation words, a list of transitive or logical connectives, and a list of temporal adverbs. These normalized lists allowed us to accurately identify positive and negative symptom information. In each sentence, information such as the location, type, duration, and frequency of symptom episodes was extracted. Finally, the extracted information such as positive and negative symptoms, location, nature, duration, and frequency of episodes was structured and stored in the output dictionary according to the temporal nodes. 
The overall process flow is illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Extraction modeling workflow.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig02.png"/></fig><p>The software and programming languages used included Python 3.8.8, pyltp 0.2.0, pandas 1.4.2, and Jieba 0.42.1.</p><p>After the extraction was completed, we randomly selected 200 cases from all the results for manual inspection to comprehensively assess the extraction capability and obtain the accuracy for 6 aspects separately: time stamp, symptom, location, episode time, status, and frequency.</p></sec><sec id="s2-7"><title>Seizure Classification Based on Machine Learning</title><p>Our work aimed to build a binary classification model capable of distinguishing between generalized and focal seizures. The analysis process, based on supervised machine learning, consisted of the following steps: data preprocessing, feature selection, algorithm selection, parameter tuning, and performance evaluation.</p></sec><sec id="s2-8"><title>Data Preprocessing</title><p>Our extraction tool was used to retrieve semiology data of the patients. After preprocessing 16,587 records by <italic>ICD</italic> coding combined with regular expression matching, 10,098 records were excluded because they did not receive a clear classification (60%).</p><p>A total of 6489 medical history text records with a diagnosis of generalized or focal seizure were retained, including 2632 records of generalized epilepsy and 3857 records of focal epilepsy. After communication with clinicians, 103 symptom words were defined to cover the main symptoms that can occur in patients with epilepsy. We used text-matching techniques to map the symptom descriptions in each record to these 103 symptom words. 
Specifically, for each record, if a symptom word was mentioned in the text, we marked the corresponding symptom word as 1; if it was not mentioned, it was marked as 0. For example, if a record mentioned &#x201C;Clonic&#x201D; but not &#x201C;Foaming at Mouth,&#x201D; then the field for &#x201C;Clonic&#x201D; was set to 1, and the field for &#x201C;Foaming at Mouth&#x201D; was set to 0.</p></sec><sec id="s2-9"><title>Feature Selection</title><p>We used several feature selection techniques to identify the most relevant features for the classification task. Specifically, we used recursive feature elimination, random forest&#x2013;based feature importance, mutual information, and the SelectKBest method using the ANOVA <italic>F</italic> value. Each method was systematically applied to the feature matrix (X) and the label vector (y) to generate a reduced set of features. We varied the number of retained features (k) across multiple values to evaluate its impact on model performance. In addition, we examined the effects of different sample ratios on the model&#x2019;s performance.</p></sec><sec id="s2-10"><title>Algorithm Selection and Parameter Tuning</title><p>Subsequently, we divided the preprocessed dataset into training and testing sets at a 7:3 ratio. We used 4 types of models as base models: decision tree [<xref ref-type="bibr" rid="ref20">20</xref>], random forest [<xref ref-type="bibr" rid="ref21">21</xref>], XGBoost [<xref ref-type="bibr" rid="ref22">22</xref>], and LightGBM [<xref ref-type="bibr" rid="ref23">23</xref>]. Using grid search algorithms and k-fold cross-validation, we optimized the hyperparameters of the models on the training set to enhance model accuracy. Specific parameters are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We also introduced the stacking ensemble learning method, which was conducted in 2 stages, as illustrated in <xref ref-type="fig" rid="figure3">Figure 3</xref>. 
In the first stage, we performed 5-fold cross-validation. Specifically, we divided the training dataset into 5 parts, with 4 serving as the training set for base model training and the remaining part serving as the validation set for generating new training data. Simultaneously, we predicted the entire test set (test_data) to create a new test dataset. In the second stage, we used the training and testing sets generated from the first stage as inputs for further training and prediction using the logistic regression model, resulting in the final outcome. In this study, we combined the XGBoost model with the random forest and LightGBM models for combined training and testing.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Stacking integration learning process. EHR: electronic health record.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig03.png"/></fig></sec><sec id="s2-11"><title>Performance Evaluation</title><p>Finally, we used the test set to evaluate the precision, recall, <italic>F</italic><sub>1</sub>-scores, and the area under the receiver operating characteristic (ROC) curve of the model. We designated &#x201C;generalized epilepsy&#x201D; as label A and &#x201C;focal epilepsy&#x201D; as label B. 
TP(<italic>A</italic>) represents true positives, FP(<italic>A</italic>) represents false positives, and FN(<italic>A</italic>) represents false negatives for label A, and similarly for label B.</p><p>Precision is defined by the following formula:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mrow><mml:mi 
mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Recall is defined by the following formula:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">F</mml:mi><mml:mi 
mathvariant="normal">N</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The <italic>F</italic><sub>1</sub>-score (<italic>F</italic><sub>1</sub>) is defined by the following formula:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi 
mathvariant="normal">l</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>For the classification analysis of seizure, the following software or programming language versions were used: Python 3.8.8, NumPy 1.24.3, pandas 1.4.2, scikit-learn 1.3.2, XGBoost 2.0.1, and LightGBM 4.1.0.</p></sec><sec id="s2-12"><title>Bilingual Ontology Construction for Seizure Semiology</title><p>Compared with other parts of the seizure information framework, epileptic semiology expression and the diversity of expression extraction tasks are more challenging, especially for Chinese EHRs of epilepsy. Therefore, we constructed a bilingual ontology to share the lexicon obtained from manual extraction and annotation. In developing the epilepsy semiology ontology (ESO), we followed 5 of the 7 steps of the Stanford methodology: (1) defining the domain and scope of the ontology, (2) reusing existing ontologies to the extent possible, (3) enumerating ontology terms, (4) defining classes and class hierarchies, and (5) defining class attributes (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p><p>In the first step, epileptologists and the ontology development team met biweekly to define the scope of the ontology and to ensure that the goals remained constant throughout its development. In steps 2 and 3, we standardized terminology by referring to existing, more authoritative epilepsy-related ontologies and terminology sets. In the fourth step, we adopted the BFO as the top-level ontology. In the fifth step, we deduplicated the annotated symptoms collected in the annotation phase to eliminate redundancy and placed them into the corresponding terms as their synonymous expression properties. 
Finally, we rendered the ontology using the OWL in the Prot&#x00E9;g&#x00E9; ontology editor and uploaded it to the world&#x2019;s largest ontology browser, Bioportal, as a first version.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Patient Cohort</title><p>The study cohort included 10,925 patients and 10,658 texts of presenting medical histories. The patient cohort included 42% (4588/10,925) females and 58% (6337/10,925) males with a mean age of 31.45 (age range: 1&#x2010;92) years. The presenting medical history texts were independently written and completed by 117 physicians. Fifty-seven percent (6227/10,925) of the patients in the patient cohort ultimately received a definitive diagnostic classification of seizures at the time of discharge, with 32% (1992/6227) of patients having focal epilepsy and 26% (1619/6227) having generalized epilepsy.</p></sec><sec id="s3-2"><title>Assessment of Labeling Quality Control Results and Extraction Capacity</title><p>In the annotation phase, we assigned 50 identical texts to the annotators without their knowledge to test the consistency of their annotations. The &#x03BA;-value of the 4 annotators was 0.862, indicating a high degree of consistency.</p><p>After completing the extraction using the model, we manually inspected a random sample of 200 notes from the extraction results (which included 235 seizures) to assess the extraction performance of the model. 
The extraction results for the 5 dimensions are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Extraction performance.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Time stamp</td><td align="left" valign="bottom">Location</td><td align="left" valign="bottom">Symptom</td><td align="left" valign="bottom">Episode time</td><td align="left" valign="bottom">Status</td><td align="left" valign="bottom">Frequency</td></tr></thead><tbody><tr><td align="left" valign="top">Total number of elements by reviewer annotation, &#x201C;gold standard&#x201D;</td><td align="left" valign="top">235</td><td align="left" valign="top">512</td><td align="left" valign="top">1325</td><td align="left" valign="top">183</td><td align="left" valign="top">1325</td><td align="left" valign="top">106</td></tr><tr><td align="left" valign="top">Total number of elements by algorithm report</td><td align="left" valign="top">196</td><td align="left" valign="top">516</td><td align="left" valign="top">1219</td><td align="left" valign="top">175</td><td align="left" valign="top">1302</td><td align="left" valign="top">93</td></tr><tr><td align="left" valign="top">Number of correct algorithm-reported elements</td><td align="left" valign="top">181</td><td align="left" valign="top">507</td><td align="left" valign="top">1126</td><td align="left" valign="top">145</td><td align="left" valign="top">1254</td><td align="left" valign="top">84</td></tr><tr><td align="left" valign="top">Recall, n/N (%)</td><td align="left" valign="top">181/235 (77)</td><td align="left" valign="top">507/512 (99)</td><td align="left" valign="top">1126/1325 (85)</td><td align="left" valign="top">145/183 (79)</td><td align="left" valign="top">1254/1325 (95)</td><td align="left" valign="top">84/106 (79)</td></tr><tr><td align="left" valign="top">Precision, n/N 
(%)</td><td align="left" valign="top">181/196 (92)</td><td align="left" valign="top">507/516 (98)</td><td align="left" valign="top">1126/1219 (92)</td><td align="left" valign="top">145/175 (82)</td><td align="left" valign="top">1254/1302 (96)</td><td align="left" valign="top">84/93 (90)</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.88</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.84</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Epilepsy Semiology Ontology</title><p>The overall hierarchical structure of ESO adheres to the architecture of the top-level ontology BFO, which supports semantic interoperability between ontologies, starting from &#x201C;continuant&#x201D; and &#x201C;occurrent&#x201D; under &#x201C;entity.&#x201D;</p><p>The ESO contains a total of 176 terms, most of which are based on the nominal entity &#x201C;anatomical entity&#x201D; and the process &#x201C;physiological pathological process,&#x201D; with a maximum depth of 10 layers. According to the principle of ontology reuse, we partially reused and rearranged the concepts of &#x201C;pathophysiological process&#x201D; and its leaf nodes in the epilepsy and seizure ontology (EPSO) [<xref ref-type="bibr" rid="ref24">24</xref>] and also referred to the existing semiology terminology collection of the International League Against Epilepsy, which includes a total of 132 epilepsy semiology terms. In terms of seizure sites, we referred to the &#x201C;Bodily Feature&#x201D; section of Systematized Nomenclature of Medicine Clinical Terms (SNOMED CT) [<xref ref-type="bibr" rid="ref25">25</xref>] and EPSO, which contains a total of 32 seizure-site terms. 
The purpose, scope, language, and users are listed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><p>As an important step in implementing the medical record extraction function of the application ontology, we added Chinese translations and synonyms of symptom performance as entity attributes (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). After annotating 2500 medical records, we obtained 5844 words of semiology. After deduplicating and removing nonepileptic seizure symptoms (usually abnormal general conditions and comorbid symptoms), we obtained 702 terms, 75 primary terms, and their synonyms. Among them, there were more than 30 synonyms for holding, dropping, and vocalization.</p></sec><sec id="s3-4"><title>Performance of Seizure Classification</title><p>In the feature selection process, we found that choosing 103 features among the 4 feature selection methods gave the best results, and we also observed that choosing different sample ratios for training had little impact on the model performance (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). On this basis, we optimized the parameters and trained 4 foundational models&#x2014;decision tree, random forest, XGBoost, and LightGBM&#x2014;to distinguish between generalized and focal epilepsy. <xref ref-type="fig" rid="figure4">Figure 4A-E</xref> illustrates the contribution of each symptom feature to the predictive decisions of these models. Notably, &#x201C;clonic,&#x201D; &#x201C;tonic,&#x201D; &#x201C;unresponsive to call,&#x201D; &#x201C;eyes rolled up,&#x201D; &#x201C;foaming at mouth,&#x201D; and &#x201C;fall&#x201D; are pivotal in differentiating seizure types.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Distribution of important features of the base model. (A) Decision tree model important features. (B) Random forest model important features. 
(C) XGBoost model important features. (D) LightGBM model important features. (E) Venn diagram of the important features of the base models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig04.png"/></fig><p>In addition, we trained a stacking ensemble learning model. As shown in <xref ref-type="fig" rid="figure5">Figure 5A-C</xref>, the stacking ensemble model outperformed the other base models in terms of precision, recall, and <italic>F</italic><sub>1</sub>-score. Among them, the ensemble model combining XGBoost and random forest yielded the best results, with the highest <italic>F</italic><sub>1</sub>-score (75.03%). We also compared the ROCs of the various models represented by different colors. Notably, the random forest model and XGBoost+random forest ensemble model outperformed the other models, as indicated by the orange and blue lines, respectively. As shown in <xref ref-type="fig" rid="figure5">Figure 5D</xref>, the random forest model had the highest area under the curve (AUC)&#x2014;0.984&#x2014;whereas the XGBoost+random forest ensemble model had an AUC of 0.919, with the AUCs of the other models falling below these 2.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Comparison of model evaluations plotted against ROCs. (A) Comparison of precision across models. (B) Comparison of recall across models. (C) Comparison of <italic>F</italic><sub>1</sub>-scores across models. (D) Comparison of ROCs across models. AUC: area under the curve; ROC: receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig05.png"/></fig><p>Ultimately, we selected the ensemble model combining XGBoost and random forest for predicting seizure classification and visualized its confusion matrix. 
As shown in <xref ref-type="fig" rid="figure6">Figure 6</xref>, the model has a precision of 0.68 for predicting &#x201C;generalized epilepsy&#x201D; and a precision of 0.80 for predicting &#x201C;focal epilepsy.&#x201D;</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>XGBoost+random forest confusion matrix plot.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e57727_fig06.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, the first Chinese-English ontology of epilepsy semiology was established, the first non&#x2013;English-structured extraction of epilepsy history text was achieved by combining manual annotation and NLP techniques, and automatic seizure classification was further accomplished based on the data extracted by the tool.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>Ninety percent of the disease burden caused by epilepsy is borne by resource-limited countries. China has more than 12% of patients with epilepsy worldwide [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The Global Burden of Disease study reported that, in 2019, China&#x2019;s disability-adjusted life years (DALYs) due to epilepsy accounted for 10% of the global DALYs and 94% of the DALYs in East Asia [<xref ref-type="bibr" rid="ref28">28</xref>]. However, the development of Chinese language EHR processing tools for epilepsy has been delayed because of the lack of high-quality corpora such as relevant terminology sets. English ontologies and terminology systems, including SNOMED CT, Unified Medical Language System, and EPSO [<xref ref-type="bibr" rid="ref26">26</xref>], are limited by the problems of diverse descriptions of Chinese medical entities, fuzzy boundaries, and the existence of nested relationships. 
Therefore, it is more difficult to support clinical terminology extraction from Chinese medical records after &#x201C;Chinese-ization&#x201D; [<xref ref-type="bibr" rid="ref29">29</xref>]. The technical challenges of Chinese NLP lie in its complex word-splitting process, high-frequency ambiguity phenomenon, and flexible and variable sentence construction [<xref ref-type="bibr" rid="ref30">30</xref>]. By contrast, English NLP is relatively simple to process because of its clear separation of words by spaces, more standardized syntactic structures, and abundant processing resources. Despite these differences, the gap between Chinese and English NLP technologies is gradually narrowing as deep learning and pretrained language models continue to advance and multilingual processing capabilities are significantly enhanced. In this study, the ontology and extraction tool constructed based on the corpus of the Southwest Epilepsy Center can better serve the grassroots areas in western China, where the burden of epilepsy is high and medical resources are relatively scarce, thereby bridging the world&#x2019;s health disparities for people with epilepsy [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>In this study, for the first time, the symptom elements of epileptic seizures were extracted at an ultrafine granularity, the accuracy of the extraction of the features reached 0.85, and the classification of generalized and focal seizures relying on the symptom features alone reached an AUC of 0.985. 
We also found that the key features in the classifier corresponded to the &#x201C;red flag&#x201D; symptoms used by human experts, yielding a list of symptoms including &#x201C;clonic,&#x201D; &#x201C;tonic,&#x201D; &#x201C;unresponsive to call,&#x201D; &#x201C;eyes rolled up,&#x201D; &#x201C;foaming at mouth&#x201D; and &#x201C;fall,&#x201D; which are the same basic key features as those categorized by human experts&#x2019; guidelines [<xref ref-type="bibr" rid="ref2">2</xref>]. To the best of our knowledge, this is the first time that a present history of epilepsy has been extracted and automatically categorized with symptom element granularity [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Barbour et al [<xref ref-type="bibr" rid="ref34">34</xref>] created regular expressions manually as well as creating false-positive filters and disambiguated them using conditional matching to extract entities such as seizure type, with internally tested <italic>F</italic><sub>1</sub>-values ranging from 0.86 to 0.90. Vulpius et al [<xref ref-type="bibr" rid="ref35">35</xref>] extracted seizure epilepsy types primarily by manually constructing dictionaries.</p><p>However, these 2 studies were based only on existing unstructured diagnostic texts rather than indirect inference through medical history texts, and only automated extraction, rather than automated classification based on symptom features, was achieved. In our seizure classification task, we used a stacking integration technique to combine the XGBoost and random forest models (AUC=0.919). Despite the higher AUC of the random forest model, it may have lower precision or recall in some categories, resulting in a less favorable <italic>F</italic><sub>1</sub>-score than the stacking method. 
The stacking method, on the other hand, by combining the advantages of both random forest and XGBoost, may achieve a more balanced performance across all categories, thereby improving the <italic>F</italic><sub>1</sub>-score.</p><p>Although downstream tasks for seizure classification currently exist, most rely on a single-model architecture, such as support vector machine, linear model, or XGBoost [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. However, by pooling multiple underlying models using stacking techniques, it is possible to improve model performance and reduce the risk of overfitting, which in turn improves the model&#x2019;s generalization capabilities.</p></sec><sec id="s4-3"><title>Future Directions</title><p>Beyond the initial diagnosis and classification of seizure, our study has the potential to identify specific types of epilepsy. For example, the classification of juvenile myoclonic epilepsy may change over the course of a single patient&#x2019;s illness, with a predominance of absence and myoclonic seizures initially, followed by intensification of generalized tonic-clonic seizures in adulthood or after practice tasks [<xref ref-type="bibr" rid="ref3">3</xref>]. This type of epilepsy is difficult to recognize because patients transition from pediatric to adult neurologists. Plug-ins based on extraction and classification models can be developed to alert epileptologists to consider this particular type.</p><p>In addition, accurate extraction of seizure duration and frequency has been used in epilepsy research to help clinical researchers accurately screen retrospective cohorts in vast multicenter electronic health information databases, for example, by accelerating the speed of patient recruitment and data collection, screening of rare epilepsy cohorts [<xref ref-type="bibr" rid="ref37">37</xref>], and screening of persistent status epilepticus in children [<xref ref-type="bibr" rid="ref38">38</xref>]. 
The extracted data also enable the dynamic and automated monitoring of postmedication efficacy, epidemiological statistics, and medical economics studies on a larger scale. In the future, we will consider the use of deep learning models and the addition of multimodal features such as imaging and EEG in the seizure classification task to achieve a more accurate and dynamically changing classification capability based on the patient&#x2019;s journey. With further improvements in extraction and classification accuracy, automated symptom-based classification will be uniquely suited to help primary care physicians and other specialists accurately classify epilepsy and select appropriate medications. In conclusion, this work demonstrates the feasibility of NLP-assisted structured extraction of epilepsy history text and downstream tasks in Chinese and provides an open ontology resource for subsequent related work.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study also has some limitations. First, the data came from a single center, and we have not yet verified the transferability of the approach to other regions in China. Second, we have not yet applied the ontology to real clinical scenarios, such as assisting clinicians in structured and efficient registration of epilepsy history. Third, the accuracy of dependency syntax analysis is crucial to the effectiveness of information extraction, and the flexibility of Chinese grammar adds to the difficulty of the analysis. Fourth, although current deep learning techniques have gained momentum to improve the situation, they also require finer tuning and extensive contextual adaptation testing. Fifth, our ontology remains in its initial iteration. There is currently no systematic approach to quality assessment and verification. We will continue to expand and refine the ontology data. 
In the future, other dimensions and modalities should be added to the features, including EEG and imaging, to further improve the accuracy of classification and the completion of more downstream tasks.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Clinically significant seizure information was successfully extracted from Chinese medical histories using NLP. This innovative approach represents a powerful tool for clinical research, with numerous potential applications, particularly for disorders characterized by complex clinical symptoms, such as seizure disorders. During this process, we constructed a bilingual ontology of seizure symptomatology comprising 702 terms. Furthermore, leveraging the extracted symptomatology information, we trained a binary classification model for generalized versus focal epilepsy using the stacking ensemble learning method. This demonstrates the feasibility of performing downstream tasks, such as seizure classification, based on the extracted information.</p></sec></sec></body><back><ack><p>We are very grateful to Bairong Shen and Xingyun Liu from the Institute of Systems Genetics of West China Hospital for their guidance on ontology construction. This work was financially supported by TianYuan Special Funds of the National Natural Science Foundation of China (No. 12026607) and Sichuan Science and Technology Program (2023YFS0047).</p></ack><fn-group><fn fn-type="con"><p>YX and LC contributed to study conception and design. YT, HL, and WL participated in data acquisition and curation. SB, LS, LJ, YD, HL, and WL participated in the data labeling process. YX and ZH contributed to ontology construction. YX, MH, SB, and LS participated in the analysis of data and extraction process. 
YX, MH, and LC contributed to drafting/revision of the manuscript for content.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">BFO</term><def><p>Basic Formal Ontology</p></def></def-item><def-item><term id="abb3">DALYs</term><def><p>disability-adjusted life years</p></def></def-item><def-item><term id="abb4">EEG</term><def><p>electroencephalogram</p></def></def-item><def-item><term id="abb5">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb6">EPSO</term><def><p>epilepsy and seizure ontology</p></def></def-item><def-item><term id="abb7">ESO</term><def><p>epilepsy semiology ontology</p></def></def-item><def-item><term id="abb8"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">OWL</term><def><p>Web Ontology Language</p></def></def-item><def-item><term id="abb11">ROC</term><def><p>receiver operating characteristic curve</p></def></def-item><def-item><term id="abb12">SNOMED CT</term><def><p>Systematized Nomenclature of Medicine Clinical Terms</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thijs</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Surges</surname><given-names>R</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Sander</surname><given-names>JW</given-names> </name></person-group><article-title>Epilepsy in 
adults</article-title><source>Lancet</source><year>2019</year><month>02</month><day>16</day><volume>393</volume><issue>10172</issue><fpage>689</fpage><lpage>701</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(18)32596-0</pub-id><pub-id pub-id-type="medline">30686584</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fisher</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Cross</surname><given-names>JH</given-names> </name><name name-style="western"><surname>D&#x2019;Souza</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Instruction manual for the ILAE 2017 operational classification of seizure types</article-title><source>Epilepsia</source><year>2017</year><month>04</month><volume>58</volume><issue>4</issue><fpage>531</fpage><lpage>542</lpage><pub-id pub-id-type="doi">10.1111/epi.13671</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cerulli Irelli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Morano</surname><given-names>A</given-names> </name><name name-style="western"><surname>Orlando</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Seizure outcome trajectories in a well-defined cohort of newly diagnosed juvenile myoclonic epilepsy patients</article-title><source>Acta Neurol Scand</source><year>2022</year><month>03</month><volume>145</volume><issue>3</issue><fpage>314</fpage><lpage>321</lpage><pub-id pub-id-type="doi">10.1111/ane.13556</pub-id><pub-id pub-id-type="medline">34791656</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wardrope</surname><given-names>A</given-names> </name></person-group><article-title>The promises and pitfalls of seizure phenomenology</article-title><source>Seizure</source><year>2023</year><month>12</month><volume>113</volume><fpage>48</fpage><lpage>53</lpage><pub-id pub-id-type="doi">10.1016/j.seizure.2023.11.008</pub-id><pub-id pub-id-type="medline">37976801</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muayqil</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Alanazy</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Almalak</surname><given-names>HM</given-names> </name><etal/></person-group><article-title>Accuracy of seizure semiology obtained from first-time seizure witnesses</article-title><source>BMC Neurol</source><year>2018</year><month>09</month><day>1</day><volume>18</volume><issue>1</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.1186/s12883-018-1137-x</pub-id><pub-id pub-id-type="medline">30172251</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patterson</surname><given-names>V</given-names> </name><name name-style="western"><surname>Samant</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>P</given-names> </name><name name-style="western"><surname>Agavane</surname><given-names>V</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>Y</given-names> </name></person-group><article-title>Diagnosis of epileptic seizures by community health workers using a mobile app: a comparison with physicians and a 
neurologist</article-title><source>Seizure</source><year>2018</year><month>02</month><volume>55</volume><fpage>4</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1016/j.seizure.2017.12.006</pub-id><pub-id pub-id-type="medline">29291457</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goodwin</surname><given-names>M</given-names> </name></person-group><article-title>Do epilepsy specialist nurses use a similar history-taking process as consultant neurologists in the differential diagnosis of patients presenting with a first seizure?</article-title><source>Seizure</source><year>2011</year><month>12</month><volume>20</volume><issue>10</issue><fpage>795</fpage><lpage>800</lpage><pub-id pub-id-type="doi">10.1016/j.seizure.2011.08.003</pub-id><pub-id pub-id-type="medline">21920782</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kakisaka</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Fujikawa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kitazawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakasato</surname><given-names>N</given-names> </name></person-group><article-title>Teleconference-based education of epileptic seizure semiology</article-title><source>Epilepsy Res</source><year>2018</year><month>09</month><volume>145</volume><fpage>73</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1016/j.eplepsyres.2018.06.007</pub-id><pub-id pub-id-type="medline">29913406</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Benbir</surname><given-names>G</given-names> </name><name name-style="western"><surname>Demiray</surname><given-names>DY</given-names> </name><name name-style="western"><surname>Delil</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yeni</surname><given-names>N</given-names> </name></person-group><article-title>Interobserver variability of seizure semiology between two neurologist and caregivers</article-title><source>Seizure</source><year>2013</year><month>09</month><volume>22</volume><issue>7</issue><fpage>548</fpage><lpage>552</lpage><pub-id pub-id-type="doi">10.1016/j.seizure.2013.04.001</pub-id><pub-id pub-id-type="medline">23611301</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ge</surname><given-names>W</given-names> </name><name name-style="western"><surname>Rice</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Sheikh</surname><given-names>IS</given-names> </name><etal/></person-group><article-title>Improving neurology clinical care with natural language processing tools</article-title><source>Neurology</source><year>2023</year><month>11</month><day>27</day><volume>101</volume><issue>22</issue><fpage>1010</fpage><lpage>1018</lpage><pub-id pub-id-type="doi">10.1212/WNL.0000000000207853</pub-id><pub-id pub-id-type="medline">37816638</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bozorgi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lhatoo</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>GQ</given-names> </name><name 
name-style="western"><surname>Sahoo</surname><given-names>SS</given-names> </name></person-group><article-title>EpiDEA: extracting structured epilepsy and seizure information from patient discharge summaries for cohort identification</article-title><source>AMIA Annu Symp Proc</source><year>2012</year><volume>2012</volume><fpage>1191</fpage><lpage>1200</lpage><pub-id pub-id-type="medline">23304396</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maldonado</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harabagiu</surname><given-names>SM</given-names> </name></person-group><article-title>Active deep learning for the identification of concepts and relations in electroencephalography reports</article-title><source>J Biomed Inform</source><year>2019</year><month>10</month><volume>98</volume><fpage>103265</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103265</pub-id><pub-id pub-id-type="medline">31470094</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fonferko-Shadrach</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lacey</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Using natural language processing to extract structured epilepsy data from unstructured clinic letters: development and validation of the ExECT (extraction of epilepsy clinical text) system</article-title><source>BMJ Open</source><year>2019</year><month>04</month><day>1</day><volume>9</volume><issue>4</issue><fpage>e023232</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2018-023232</pub-id><pub-id pub-id-type="medline">30940752</pub-id></nlm-citation></ref><ref 
id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Castano</surname><given-names>VG</given-names> </name><name name-style="western"><surname>Spotnitz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Waldman</surname><given-names>GJ</given-names> </name><etal/></person-group><article-title>Identification of patients with drug-resistant epilepsy in electronic medical record data using the Observational Medical Outcomes Partnership Common Data Model</article-title><source>Epilepsia</source><year>2022</year><month>11</month><volume>63</volume><issue>11</issue><fpage>2981</fpage><lpage>2993</lpage><pub-id pub-id-type="doi">10.1111/epi.17409</pub-id><pub-id pub-id-type="medline">36106377</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gallagher</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Shinohara</surname><given-names>RT</given-names> </name><etal/></person-group><article-title>Long-term epilepsy outcome dynamics revealed by natural language processing of clinic notes</article-title><source>Epilepsia</source><year>2023</year><month>07</month><volume>64</volume><issue>7</issue><fpage>1900</fpage><lpage>1909</lpage><pub-id pub-id-type="doi">10.1111/epi.17633</pub-id><pub-id pub-id-type="medline">37114472</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lhatoo</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Bernasconi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Blumcke</surname><given-names>I</given-names> 
</name><etal/></person-group><article-title>Big data in epilepsy: clinical and research considerations. Report from the Epilepsy Big Data Task Force of the International League Against Epilepsy</article-title><source>Epilepsia</source><year>2020</year><month>09</month><volume>61</volume><issue>9</issue><fpage>1869</fpage><lpage>1883</lpage><pub-id pub-id-type="doi">10.1111/epi.16633</pub-id><pub-id pub-id-type="medline">32767763</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Schaub</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Modelling kidney disease using ontology: insights from the Kidney Precision Medicine Project</article-title><source>Nat Rev Nephrol</source><year>2020</year><month>11</month><volume>16</volume><issue>11</issue><fpage>686</fpage><lpage>696</lpage><pub-id pub-id-type="doi">10.1038/s41581-020-00335-w</pub-id><pub-id pub-id-type="medline">32939051</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haendel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Chute</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>PN</given-names> </name></person-group><article-title>Classification, ontology, and precision medicine</article-title><source>N Engl J Med</source><year>2018</year><month>10</month><day>11</day><volume>379</volume><issue>15</issue><fpage>1452</fpage><lpage>1462</lpage><pub-id pub-id-type="doi">10.1056/NEJMra1615014</pub-id><pub-id 
pub-id-type="medline">30304648</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Che</surname><given-names>W</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name></person-group><article-title>N-LTP: an open-source neural language technology platform for Chinese</article-title><source>arXiv</source><comment>Preprint posted online on Sep 24, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2009.11616</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><source>Classification and Regression Trees</source><year>1984</year><edition>1</edition><publisher-name>Routledge</publisher-name></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting 
system</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>785</fpage><lpage>794</lpage></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>LightGBM: accelerated genomically designed crop breeding through ensemble learning</article-title><source>Genome Biol</source><year>2021</year><month>09</month><day>20</day><volume>22</volume><issue>1</issue><fpage>271</fpage><pub-id pub-id-type="doi">10.1186/s13059-021-02492-y</pub-id><pub-id pub-id-type="medline">34544450</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Lhatoo</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>DK</given-names> </name><etal/></person-group><article-title>Epilepsy and seizure ontology: towards an epilepsy informatics infrastructure for clinical research and patient care</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>1</issue><fpage>82</fpage><lpage>89</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001696</pub-id><pub-id pub-id-type="medline">23686934</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Clinical 
medicine</article-title><source>ScienceDirect</source><access-date>2024-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.sciencedirect.com/topics/medicine-and-dentistry/clinical-medicine">https://www.sciencedirect.com/topics/medicine-and-dentistry/clinical-medicine</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Epilepsy centers in China: current status and ways forward</article-title><source>Epilepsia</source><year>2021</year><month>11</month><volume>62</volume><issue>11</issue><fpage>2640</fpage><lpage>2650</lpage><pub-id pub-id-type="doi">10.1111/epi.17058</pub-id><pub-id pub-id-type="medline">34510417</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Prevalence of epilepsy in the People&#x2019;s Republic of China: a systematic review</article-title><source>Epilepsy Res</source><year>2013</year><month>07</month><volume>105</volume><issue>1-2</issue><fpage>195</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.1016/j.eplepsyres.2013.02.002</pub-id><pub-id pub-id-type="medline">23507331</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>GBD 2019 Diseases and Injuries 
Collaborators</collab></person-group><article-title>Global burden of 369 diseases and injuries in 204 countries and territories, 1990&#x2013;2019: a systematic analysis for the Global Burden of Disease Study 2019</article-title><source>Lancet</source><year>2020</year><month>10</month><day>17</day><volume>396</volume><issue>10258</issue><fpage>1204</fpage><lpage>1222</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(20)30925-9</pub-id><pub-id pub-id-type="medline">33069326</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rui</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ran</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mi-ye</surname><given-names>W</given-names> </name></person-group><article-title>Enriching plan for Chinese synonyms in medical terms</article-title><source>Chin J Med Libr Inf Sci</source><year>2021</year><volume>30</volume><issue>2</issue><fpage>25</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.3969/j.issn.1671-3982.2021.02.005</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Boer</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Mula</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sander</surname><given-names>JW</given-names> </name></person-group><article-title>The global burden and stigma of epilepsy</article-title><source>Epilepsy Behav</source><year>2008</year><month>05</month><volume>12</volume><issue>4</issue><fpage>540</fpage><lpage>546</lpage><pub-id 
pub-id-type="doi">10.1016/j.yebeh.2007.12.019</pub-id><pub-id pub-id-type="medline">18280210</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>The competence of village clinicians in the diagnosis and management of childhood epilepsy in Southwestern China and its determinants: a cross-sectional study</article-title><source>Lancet Reg Health West Pac</source><year>2020</year><month>10</month><volume>3</volume><fpage>100031</fpage><pub-id pub-id-type="doi">10.1016/j.lanwpc.2020.100031</pub-id><pub-id pub-id-type="medline">34327383</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Decker</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Turco</surname><given-names>A</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Development of a natural language processing algorithm to extract seizure types and frequencies from the electronic health record</article-title><source>Seizure</source><year>2022</year><month>10</month><volume>101</volume><fpage>48</fpage><lpage>51</lpage><pub-id pub-id-type="doi">10.1016/j.seizure.2022.07.010</pub-id><pub-id pub-id-type="medline">35882104</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Gallagher</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Conrad</surname><given-names>EC</given-names> </name><etal/></person-group><article-title>Extracting seizure frequency from epilepsy clinic notes: a machine reading approach to natural language processing</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>04</month><day>13</day><volume>29</volume><issue>5</issue><fpage>873</fpage><lpage>881</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac018</pub-id><pub-id pub-id-type="medline">35190834</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barbour</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hesdorffer</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Automated detection of sudden unexpected death in epilepsy risk factors in electronic medical records using natural language processing</article-title><source>Epilepsia</source><year>2019</year><month>06</month><volume>60</volume><issue>6</issue><fpage>1209</fpage><lpage>1220</lpage><pub-id pub-id-type="doi">10.1111/epi.15966</pub-id><pub-id pub-id-type="medline">31111463</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vulpius</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Werge</surname><given-names>S</given-names> </name><name name-style="western"><surname>J&#x00F8;rgensen</surname><given-names>IF</given-names> </name><etal/></person-group><article-title>Text mining of electronic health records can validate a register-based diagnosis of epilepsy and subgroup into focal and 
generalized epilepsy</article-title><source>Epilepsia</source><year>2023</year><month>10</month><volume>64</volume><issue>10</issue><fpage>2750</fpage><lpage>2760</lpage><pub-id pub-id-type="doi">10.1111/epi.17734</pub-id><pub-id pub-id-type="medline">37548470</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fernandes</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cardall</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jing</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Identification of patients with epilepsy using automated electronic health records phenotyping</article-title><source>Epilepsia</source><year>2023</year><month>06</month><volume>64</volume><issue>6</issue><fpage>1472</fpage><lpage>1481</lpage><pub-id pub-id-type="doi">10.1111/epi.17589</pub-id><pub-id pub-id-type="medline">36934317</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barbour</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yozawitz</surname><given-names>EG</given-names> </name><etal/></person-group><article-title>Creating rare epilepsy cohorts using keyword search in electronic health records</article-title><source>Epilepsia</source><year>2023</year><month>10</month><volume>64</volume><issue>10</issue><fpage>2738</fpage><lpage>2749</lpage><pub-id pub-id-type="doi">10.1111/epi.17725</pub-id><pub-id pub-id-type="medline">37498137</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chafjiri</surname><given-names>FMA</given-names> </name><name name-style="western"><surname>Reece</surname><given-names>L</given-names> </name><name name-style="western"><surname>Voke</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Natural language processing for identification of refractory status epilepticus in children</article-title><source>Epilepsia</source><year>2023</year><month>12</month><volume>64</volume><issue>12</issue><fpage>3227</fpage><lpage>3237</lpage><pub-id pub-id-type="doi">10.1111/epi.17789</pub-id><pub-id pub-id-type="medline">37804085</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials.</p><media xlink:href="medinform_v12i1e57727_app1.docx" xlink:title="DOCX File, 574 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Construction process of epilepsy semiology ontology.</p><media xlink:href="medinform_v12i1e57727_app2.png" xlink:title="PNG File, 152 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Table S1. Purpose, scope, language and users of WWECA.</p><media xlink:href="medinform_v12i1e57727_app3.xlsx" xlink:title="XLSX File, 62 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Model performance with different feature selection methods and sample ratios.</p><media xlink:href="medinform_v12i1e57727_app4.png" xlink:title="PNG File, 1956 KB"/></supplementary-material></app-group></back></article>