<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e73884</article-id><article-id pub-id-type="doi">10.2196/73884</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Process for Quality Management of Electronic Medical Records&#x2013;Based Data: Case Study Using Real Colorectal Cancer Data</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Park</surname><given-names>NaYoung</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Na</surname><given-names>Kyungmin</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sunwoo</surname><given-names>Woongsang</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Baek</surname><given-names>Jeong-Heum</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Youngho</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Lee</surname><given-names>Suehyun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Woo</surname><given-names>Hyekyung</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Health Administration, Kongju National University</institution><addr-line>Gongju-Si, Chungcheongnam-do</addr-line><addr-line>Gongju</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Office of eHealth Research and Business, Seoul National University Bundang Hospital</institution><addr-line>Seongnam-si</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>Department of Computer Engineering, College of IT Convergence, Gachon University</institution><addr-line>Seongnam</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Otorhinolaryngology, Gil Medical Center, Gachon University, College of Medicine</institution><addr-line>Incheon</addr-line><country>Republic of Korea</country></aff><aff id="aff5"><institution>Division of Colon and Rectal Surgery, Department of Surgery, Gil Medical Center, Gachon University, College of Medicine</institution><addr-line>Incheon</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib 
contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bracken-Clarke</surname><given-names>Dara</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Osman</surname><given-names>Mohamed Hosny</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hyekyung Woo, PhD, Department of Health Administration, Kongju National University, Gongju-Si, Chungcheongnam-do, Gongju, 32588, Republic of Korea, 82 41-850-0328; <email>hkwoo@kongju.ac.kr</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>13</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e73884</elocation-id><history><date date-type="received"><day>13</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; NaYoung Park, Kyungmin Na, Woongsang Sunwoo, Jeong-Heum Baek, Youngho Lee, Suehyun Lee, Hyekyung Woo. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 13.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e73884"/><abstract><sec><title>Background</title><p>As data-driven medical research advances, vast amounts of medical data are being collected, giving researchers access to important information. However, issues such as heterogeneity, complexity, and incompleteness of datasets limit their practical use. Errors and missing data negatively affect artificial intelligence&#x2013;based predictive models, undermining the reliability of clinical decision-making. Thus, it is important to develop a quality management process (QMP) for clinical data.</p></sec><sec><title>Objective</title><p>This study aimed to develop a rules-based QMP to address errors and impute missing values in real-world data, establishing high-quality data for clinical research.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used clinical data from 6491 patients with colorectal cancer (CRC) collected at Gachon University Gil Medical Center between 2010 and 2022, leveraging the clinical library established within the Korea Clinical Data Use Network for Research Excellence. 
First, we conducted a literature review on the prognostic prediction of CRC to assess whether the data met our research purposes, comparing selected variables with real-world data. A labeling process was then implemented to extract key variables, which facilitated the creation of an automatic staging library. This library, combined with a rule-based process, allowed for systematic analysis and evaluation.</p></sec><sec sec-type="results"><title>Results</title><p>Theoretically, the tumor, node, metastasis (TNM) stage was identified as an important prognostic factor for CRC, but it was not selected through feature selection in real-world data. After applying the QMP, rates of missing data were reduced from 75.3% to 35.7% for TNM and from 24.3% to 18.5% for surveillance, epidemiology, and end results across 6491 cases, confirming the system&#x2019;s effectiveness. Variable importance analysis through feature selection revealed that TNM stage and detailed code variables, which were previously unselected, were included in the improved model.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In sum, we developed a rules-based QMP to address errors and impute missing values in Korea Clinical Data Use Network for Research Excellence data, enhancing data quality. The applicability of the process to real-world datasets highlights its potential for broader use in clinical studies and cancer research.</p></sec></abstract><kwd-group><kwd>quality management</kwd><kwd>medical data</kwd><kwd>real-world data</kwd><kwd>colorectal cancer</kwd><kwd>data quality</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Medical datasets include various forms of data such as patients&#x2019; health status, diagnosis, and treatment information, collected through electronic medical records, diagnostic tests, and treatment records [<xref ref-type="bibr" rid="ref1">1</xref>]. 
These data support patient-specific treatment and accurate decision-making by medical professionals [<xref ref-type="bibr" rid="ref2">2</xref>]. With the growing importance of data-driven medical research, studies using medical data have become increasingly common [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Advancements in artificial intelligence (AI) and machine learning technologies have further expanded the potential uses of these data, such as for early disease diagnosis and prediction model development [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>As the volume of medical data grows, infrastructures are being established to analyze and use the data efficiently [<xref ref-type="bibr" rid="ref6">6</xref>]. Data sharing and linkage enable researchers to access the necessary data more easily. However, challenges such as heterogeneity and incompleteness of datasets remain [<xref ref-type="bibr" rid="ref7">7</xref>]. For example, during the pseudonymization of integrated medical data, some information may be restricted, and differences in data formats or structures can compromise consistency during adjustment.</p><p>Issues such as missing data, inconsistencies, and errors can degrade data quality [<xref ref-type="bibr" rid="ref8">8</xref>]. Medical data often exhibit imbalance, where some categories of data are underrepresented, which can lead to biased learning and distorted outcomes in AI-based predictive models [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. These quality issues can undermine the reliability of analysis results. Therefore, it is essential to develop a quality management process (QMP) to correct errors and supplement data to improve the quality of medical data and build high-quality datasets. 
Given the current shortage of specialized personnel trained in handling and managing raw data, it is crucial to manage data quality effectively and enhance usability through systematic and standardized QMPs.</p><p>In the medical field, an increasing number of studies have addressed data quality issues [<xref ref-type="bibr" rid="ref11">11</xref>]. Evaluations of data quality using colon cancer data and proposals for QMPs and frameworks are gaining traction [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Recently, new methodologies for managing the quality of AI training data have been introduced [<xref ref-type="bibr" rid="ref14">14</xref>], helping to establish high-quality datasets that meet research purposes for diagnosis and prognosis prediction [<xref ref-type="bibr" rid="ref15">15</xref>]. While medical data play a decisive role in clinical research and patient treatment, systematic quality management that ensures the consistency, accuracy, and completeness of data is crucial for solving various errors and dealing with missing information [<xref ref-type="bibr" rid="ref16">16</xref>]. Although comprehensive quality management methodologies for the medical data collection stage are emerging [<xref ref-type="bibr" rid="ref17">17</xref>], processes applicable to real-world data (RWD) are still lacking.</p><p>Therefore, the aim of this study is to develop a QMP for colorectal cancer (CRC) data from the Korea Clinical Data Use Network for Research Excellence (K-CURE). This process was designed to systematically align with the research objectives, identifying key prognostic variables for CRC. 
We implemented a rule-based approach to improve data completeness and evaluated the effectiveness of the QMP by comparing the data before and after its application.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Stage 1: Planning Stage</title><sec id="s2-1-1"><title>Data Resources</title><p>We used CRC clinical library data established in the K-CURE project at Gachon University Gil Medical Center, approved for use through an institutional review board exemption (GFIRB2024-169). The K-CURE project supports AI-based research and technology development by sharing, providing access to, and linking clinical data from various hospitals. We used a pseudonymized clinical library of 6491 patients with CRC, collected between 2010 and 2022 for the K-CURE project. The pseudonymized clinical library refers to a deidentified dataset in which personally identifiable information has been removed and replaced with pseudonyms. The K-CURE clinical library includes patient information, medical history, diagnoses, cancer staging, test results, treatments, and follow-up data. In addition, structured text-based reports of imaging test results and pathology data from the clinical library were integrated to perform quality management.</p></sec><sec id="s2-1-2"><title>Ethical Considerations</title><p>The study used CRC clinical library data established in the K-CURE project at Gachon University Gil Medical Center, which was approved for use through an institutional review board exemption (GFIRB2024-169). The dataset was pseudonymized, and personally identifiable information was removed and replaced with pseudonyms. Informed consent was waived due to the use of deidentified retrospective data. No compensation was provided to participants. 
Privacy and confidentiality of patient data were strictly maintained throughout the study.</p></sec><sec id="s2-1-3"><title>Study Design</title><p>In Stage 1, we planned the overall research design to establish a QMP for clinical data that meets our research objectives. To systematize the quality management procedures, we designed a detailed step-by-step process across 4 stages: planning, identification, operation, and evaluation.</p><p>In the identification stage, we assessed the general status of the RWD to identify areas requiring quality management. In the operation stage, the QMP was applied to the identified targets. Finally, in the evaluation stage, we compared the pre- and post-quality management results to assess improvements in the data. The overall flow of this study is presented in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study design. DB: Database; RWD: real-world data.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73884_fig01.png"/></fig></sec></sec><sec id="s2-2"><title>Stage 2: Identification Stage</title><sec id="s2-2-1"><title>Literature Review to Identify Prognostic Factors</title><p>In Stage 2, we conducted a literature review to verify whether the K-CURE CRC data are suitable for constructing a prognostic prediction model. In particular, we sought to identify the key factors influencing the prognosis of patients with CRC and the major variables to consider for constructing a prognostic prediction model for CRC. We searched PubMed for articles published from 2010 to 2024. Our key search terms were (colorectal cancer OR colorectal OR CRC) AND (prognosis OR prognostic factor OR predict OR risk factor). The inclusion criteria were as follows: articles published between January 1, 2010, and March 31, 2024, and studies that focused on overall survival, mortality, or 5-year survival as dependent variables. 
The exclusion criteria included studies with low relevance to the topic or insufficient information on prognostic factors for patients with CRC, and those that discussed only a research design without specific findings. Key influencing factors identified from the selected literature were quantified, and theoretically important factors were derived. These were then used to establish variables for the prognostic prediction model.</p></sec><sec id="s2-2-2"><title>Feature Selection for Identifying Prognostic Factors</title><p>We performed feature selection to identify prognostic factors in the K-CURE CRC data. The Gradient Boosting Classifier was used to evaluate the importance of variables, and the results were compared to theoretically important variables. This model was selected due to its robustness in handling missing values and its effectiveness in evaluating variable importance, which makes it suitable for real-world clinical datasets [<xref ref-type="bibr" rid="ref18">18</xref>]. Variables with low importance or those inconsistent with the literature review findings were selected as target variables requiring quality management. To conduct quality management, we performed frequency analysis of the major variables of the prognostic prediction model. Then, the error and missing data rates for these target variables were reviewed to examine the overall data distribution. The rate of missing data was calculated using frequency analysis for each variable. Error rates were measured by comparing manually generated stage codes with the data of 164 randomly selected samples, limited to cases without missing data.</p></sec></sec><sec id="s2-3"><title>Stage 3: Operation Stage</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> provides a schematic of the overall QMP.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Schematic diagram of our proposed quality management program. 
RWD: real-world data; SEER: surveillance, epidemiology, and end results; TNM: tumor, node, metastasis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73884_fig02.png"/></fig><sec id="s2-3-1"><title>Critical Indicator Labeling for Automated Stage Classification Library</title><p>The target variables, tumor, node, and metastasis (TNM) and surveillance, epidemiology, and end results (SEER), are critical indicators for evaluating CRC staging. TNM stage is a standardized cancer stage classification system of the American Joint Committee on Cancer, based on the 8th edition of the American Joint Committee on Cancer Cancer Staging Manual [<xref ref-type="bibr" rid="ref19">19</xref>]. It evaluates the progression of cancer based on tumor depth, lymph node metastasis, and distant metastasis. SEER summary stage is a standardized cancer staging system widely used in international cancer registration systems to classify how far cancer has spread from the primary site of origin.</p><p>Before establishing the QMP, a case analysis was conducted to correct errors and address missing data in the target variables. This analysis involved a detailed review of the TNM and SEER variables of cases in the CRC sample data. We identified cases for which the staging information was omitted or incorrectly recorded to assess the completeness and accuracy of the TNM and SEER variables. 
We also confirmed whether the missing or erroneous staging information could be supplemented using pathology reports and imaging test results according to a standardized classification system.</p><p>To identify key indicators for extracting target variables, we referred to the CRC guidelines, &#x201C;Korean Clinical Guideline for Colon and Rectal Cancer v.1.0 [<xref ref-type="bibr" rid="ref20">20</xref>],&#x201D; and the most recent SEER manual, &#x201C;Summary Stage 18 [<xref ref-type="bibr" rid="ref21">21</xref>].&#x201D; Labeling was conducted on specific words and keywords to identify detailed codes for TNM and SEER in the pathology report and imaging test results, respectively. In the labeling process, medical knowledge related to CRC was incorporated to establish coding conditions and patterns for accurate staging extraction.</p></sec><sec id="s2-3-2"><title>Development of QMPs and Improving CRC Data for Research</title><p>In total, 164 cases were randomly selected, and TNM and SEER codes were manually generated for each case. This process adhered to standardized guidelines and protocols for CRC diagnosis and staging classification. To evaluate data quality, the manually generated codes were compared with the corresponding codes in the existing dataset for the same cases, excluding those with missing values. The error rate was calculated based on the number of discrepancies identified through this comparison. The manually generated TNM and SEER code data were also used as reference criteria for validating the automated stage classification library and used as basic data to evaluate the accuracy and consistency of the generated codes.</p><p>We evaluated whether the automated library corresponded to guidelines in terms of extracting accurate staging information from clinical data. Then, the accuracy of the library was verified by comparing the concordance between the manually generated TNM and SEER codes and the codes derived from the library. 
This process focused on the consistency of codes, reasons for discrepancies, and major patterns.</p></sec></sec><sec id="s2-4"><title>Stage 4: Evaluation Stage</title><p>In Stage 4, the data generated by applying the QMP were evaluated. By comparing the rates of missing data for target variables before and after quality management, we could confirm to what extent the missing values were corrected through the process. Based on the data before and after quality management, initial and improved prognosis prediction models were constructed, and their performances were compared. Model performance was evaluated according to metrics such as accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, and area under the receiver operating characteristic curve, to assess whether the application of the QMP improved predictive performance. In addition, we analyzed the impact of target variables on CRC prognosis by checking the importance of variables in the model through feature selection before and after quality management. The prognosis prediction model was constructed using the Gradient Boosting algorithm, and the dependent variable was set as 5-year survival using death information. Python (version 3.12) was used for statistical analysis.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Stage 2: Data Descriptive Study Results</title><p>Based on the literature review, the most frequently identified prognostic factors were T stage (tumor invasion depth) and N stage (lymph node metastasis), cited in 33 and 32 articles, respectively. Other significant factors included M stage (distant metastasis), the integrated TNM staging system, tumor location, pathological differentiation, and carcinoembryonic antigen levels. Staging may be classified as clinical TNM, pathological TNM, or postneoadjuvant pathological TNM.</p><p>As a result of Stage 2, variables requiring quality management were identified. 
A summary of the variables derived from the literature review and feature selection is presented in <xref ref-type="table" rid="table1">Table 1</xref>. As target variables, we selected TNM stage and SEER, which are theoretically important for prognostic prediction.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of literature review and feature selection results.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Factors</td><td align="left" valign="top">Values</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Literature review, prognostic factors (n)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prognostic factors</td><td align="left" valign="top">N</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T stage (depth of invasion)</td><td align="char" char="." valign="top">33</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N stage (lymph node metastasis)</td><td align="char" char="." valign="top">32</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M stage (distant metastasis)</td><td align="char" char="." valign="top">11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tumor, node, metastasis staging</td><td align="char" char="." valign="top">18</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tumor grade or pathology</td><td align="char" char="." 
valign="top">40</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Carcinoembryonic antigen (ng/mL)</td><td align="char" char="." valign="top">36</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tumor diameter/length/size (cm)</td><td align="char" char="." valign="top">25</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Histological type</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Neutrophil-to-lymphocyte ratio</td><td align="char" char="." valign="top">15</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Adjuvant chemotherapy</td><td align="char" char="." valign="top">20</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Liver metastasis</td><td align="char" char="." valign="top">13</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lymphatic invasion</td><td align="left" valign="top">11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Platelet-to-lymphocyte ratio</td><td align="char" char="." valign="top">8</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lymphocyte-to-monocyte ratio</td><td align="char" char="." valign="top">8</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of retrieved lymph nodes</td><td align="char" char="." 
valign="top">8</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Venous invasion</td><td align="char" char="." valign="top">7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chemotherapy</td><td align="char" char="." valign="top">7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ECOG (performance status)</td><td align="char" char="." valign="top">7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vascular invasion</td><td align="char" char="." valign="top">6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perineural invasion</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Metastatic site (number of)</td><td align="char" char="." valign="top">5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CA19-9 (U/mL)</td><td align="char" char="." valign="top">5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Glasgow prognostic score</td><td align="char" char="." valign="top">5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>American Society of Anesthesiologists grade</td><td align="char" char="." 
valign="top">5</td></tr><tr><td align="left" valign="top" colspan="2">Feature selection, importance</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Year of initial visit_2022</td><td align="char" char="." valign="top">0.231758</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SEER<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>_2.0</td><td align="char" char="." valign="top">0.172401</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Age</td><td align="char" char="." valign="top">0.047391</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Histological diagnosis_16.0</td><td align="char" char="." valign="top">0.045125</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Current_drinking_status_1.0</td><td align="char" char="." valign="top">0.037545</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Year of initial visit_2017</td><td align="char" char="." valign="top">0.037528</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perineural invasion_3.0</td><td align="char" char="." valign="top">0.036643</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Family history_cancer_1.0</td><td align="char" char="." valign="top">0.033641</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perineural invasion_2.0</td><td align="char" char="." 
valign="top">0.033559</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perineural invasion_nan</td><td align="char" char="." valign="top">0.033198</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_C18.5</td><td align="char" char="." valign="top">0.02482</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Current_smoke_status_nan</td><td align="char" char="." valign="top">0.022937</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Histological diagnosis_26.0</td><td align="char" char="." valign="top">0.020962</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Histological diagnosis_23.0</td><td align="char" char="." valign="top">0.01356</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_C18.1</td><td align="char" char="." valign="top">0.012204</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lymphatic invasion_2.0</td><td align="char" char="." valign="top">0.011136</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>TNM_T4N2M1</td><td align="char" char="." valign="top">0.010998</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_C18</td><td align="char" char="." 
valign="top">0.010715</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_C18.3</td><td align="char" char="." valign="top">0.010592</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_K83.8</td><td align="char" char="." valign="top">0.010291</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Molecular_pathology_findings_nan</td><td align="char" char="." valign="top">0.008958</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary site_C17.0</td><td align="char" char="." valign="top">0.008938</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BMI</td><td align="char" char="." valign="top">0.008858</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>SEER: surveillance, epidemiology, and end results.</p></fn></table-wrap-foot></table-wrap><p>The results of the frequency analysis of the major variables are shown in <xref ref-type="table" rid="table2">Table 2</xref>. Among the key variables, missing data were observed for height, weight, BMI, total lymph nodes, positive lymph nodes, and the target variables TNM and SEER. The rate of missing data for TNM stage was notably high at 75.3%, while that for SEER was 24.3% across 6491 cases. Moreover, when the error rate was measured using manually generated stage codes from 164 randomly selected samples, the error rate for TNM stage was 50% (43 errors out of 86 nonmissing cases). 
For the SEER variable, the error rate was 31.1% (47 errors out of 151 nonmissing cases).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Patient characteristics and missing rates of target variables (N=6491).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables and categories</td><td align="left" valign="bottom">N (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Sex, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">3936 (60.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">2555 (39.4)</td></tr><tr><td align="left" valign="top">Age, mean (SD)</td><td align="left" valign="top">66.79 (13.4)</td></tr><tr><td align="left" valign="top" colspan="2">Dead, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">394 (6.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">6097 (93.9)</td></tr><tr><td align="left" valign="top" colspan="2">5 y survival, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">6131 (94.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">360 (5.6)</td></tr><tr><td align="left" valign="top">Height, mean (SD)</td><td align="left" valign="top">162.00 (9.15)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing, n (%)</td><td align="left" valign="top">2144 (33.0)</td></tr><tr><td align="left" valign="top">Weight, mean (SD)</td><td align="left" valign="top">62.44 (11.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing, n (%)</td><td align="left" valign="top">2135 (32.9)</td></tr><tr><td align="left" valign="top">BMI, mean (SD)</td><td align="left" valign="top">23.72 (3.60)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing, n (%)</td><td align="left" valign="top">2146 (33.1)</td></tr><tr><td align="left" valign="top">Total lymph node, mean (SD)</td><td align="left" valign="top">20.25 (12.05)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing, n (%)</td><td align="left" valign="top">2633 (40.6)</td></tr><tr><td align="left" valign="top">Positive lymph node, mean (SD)</td><td align="left" valign="top">1.92 (4.40)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing, n (%)</td><td align="left" valign="top">2633 (40.6)</td></tr><tr><td align="left" valign="top" colspan="2">Operation, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">2631 (40.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">3860 (59.5)</td></tr><tr><td align="left" valign="top" colspan="2">Chemotherapy, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" 
valign="top">224 (3.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">6267 (96.6)</td></tr><tr><td align="left" valign="top" colspan="2">Radiotherapy, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">383 (5.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">6108 (94.1)</td></tr><tr><td align="left" valign="top" colspan="2">Complication after surgery, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">524 (8.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">5967 (91.9)</td></tr><tr><td align="left" valign="top" colspan="2">SEER<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="left" valign="top">355 (5.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">1818 (28)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">806 (12.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">192 (3)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">890 (13.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>5</td><td align="left" valign="top">14 (0.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7</td><td align="left" valign="top">792 (12.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>9</td><td align="left" valign="top">48 (0.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">1576 (24.3)</td></tr><tr><td align="left" valign="top">T stage, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="left" valign="top">1 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tis</td><td align="left" valign="top">1 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">304 (4.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">238 (3.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">814 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">248 (3.8)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">4885 (75.3)</td></tr><tr><td align="left" valign="top">N stage, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="left" valign="top">968 (14.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">399 (6.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">235 (3.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">3 (0.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">1 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">4885 (75.3)</td></tr><tr><td align="left" valign="top">M stage, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="left" valign="top">1459 (22.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">147 (2.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">4885 (75.3)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>SEER: surveillance, epidemiology, and end 
results.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Stage 3: Data Quality Management</title><p>We developed guidelines for creating an automated stage classification library. Examples of critical indicator terms identified for TNM and SEER through labeling are highlighted in italics in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>, respectively. These guidelines define labeled terms and conditions that allow rule-based automated classification of cancer stage.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Tumor, node, metastasis stage labeling following the Korean clinical guideline for colorectal cancer v.1.0, with critical indicator terms in italics.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Stage</td><td align="left" valign="bottom">Labels</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Pathology report</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">No residual tumor</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tis</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Confinement to mucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to <italic>lamina propria</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." 
valign="top">(pTis)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>submucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to <italic>submucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion into <italic>submucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to <italic>muscularis mucosae</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pT1) /(ypT1)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>muscularis propria</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pT2) /(ypT2)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>pericolic adipose tissue</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>perirectal adipose tissue</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>subserosa</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." 
valign="top">(pT3) /(ypT3)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Penetrates <italic>visceral peritoneum</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Penetration to serosa and perforation</td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pT4a) /(ypT4)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Direct invades <italic>adjacent organs</italic> or <italic>structures</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Directly invades adjacent organ</td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pT4b)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>regional lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>pericolic lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>perirectal lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>pericolic and perirectal lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>pericolic and peri-ileal lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No metastasis</italic> in - <italic>lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No tumor 
present in</italic> 16 <italic>regional lymph nodes</italic> (0/16)</td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pN0) /(yN0) /(ypN0)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Metastasis in <italic>1</italic> of ~ <italic>regional lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pN1a) /(ypN1a)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Metastasis in <italic>2 (or 3)</italic> of ~ <italic>regional lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pN1b) /(ypN1b)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Tumor deposit</italic> present</td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pN1c) /(ypN1c)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Metastasis in <italic>4 (more than)</italic> of ~ <italic>regional lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="char" char="." valign="top">(pN2a) /(ypN2a)</td></tr><tr><td align="left" valign="top"/><td align="char" char="." 
valign="top">(pN2b) /(ypN2b)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Adenocarcinoma, metastatic</italic> from</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic colonic adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic carcinoma</italic> of rectum</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic mixed adenoneuroendocrine carcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic appendiceal high-grade goblet cell adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic mucinous adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic mucinous carcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Consistent with metastatic carcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Omental seeding</italic></td></tr><tr><td align="left" valign="top" colspan="2">Imaging examination results</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of abnormal wall thickening</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No visible definite</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tis</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Tis</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Invasion of lamina propria</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>T1</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Submucosal invasion</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>T2</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>T3</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Pericolic (fat) infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Perirectal (fat) infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Mesorectal fat infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Subserosal invasion</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>T4</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>T4a /T4b</italic></td></tr><tr><td align="left" valign="top"/><td align="left" 
valign="top"><italic>Visceral peritoneum</italic></td></tr><tr><td align="left" valign="top" colspan="2">Synonym: LN(s), L/N(s), lymph node(s)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>N0</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No enlarged</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No abnormal enlarging</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No pathologic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Nor enlarged</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of regional</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of enlarged</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of enlarged regional</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No significant</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No significant enlarged</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No significant enlargement</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No significant enlarged peritumoral</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No visible enlarged</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td></tr><tr><td align="left" valign="top"/><td 
align="left" valign="top"><italic>N1</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Regional</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastases</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Regional metastatic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Regional - metastasis (metastases)</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>With regional lymph node metastasis</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>N2</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Multiple regional metastatic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Multiple regional - metastasis/metastases</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Several regional - metastasis/metastases</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Several regional - metastasis</italic></td></tr><tr><td align="left" valign="top" colspan="2">Synonym: metastasis, metastases, metastatic<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of distant</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of definite 
distant</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of liver</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of hepatic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Nor distant</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Nor or no visible</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Rather than</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of enlarged regional L/N or distant metastasis</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Bone</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Liver</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Hepatic</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Pulmonary</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Several</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>No evidence of distant</italic></td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The terms listed as synonyms should be used together with the N stage labels to create the labeling.</p></fn><fn id="table3fn2"><p><sup>b</sup>The terms listed as synonyms should be used together with the M stage labels to create the labeling.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 
4.</label><caption><p>SEER<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> labeling by Summary Stage 2018, with critical indicator terms in italics.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">SEER code</td><td align="left" valign="bottom">Labels</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Pathology report</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Intraepithelial</italic></td></tr><tr><td align="char" char="." valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Intramucosal</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Confinement in the <italic>lamina propria</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to <italic>lamina propria</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Confinement to mucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Invasion to mucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Extension to mucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Involvement of mucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to <italic>muscularis mucosae</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Invades muscularis propria</italic></td></tr><tr><td 
align="left" valign="top"/><td align="left" valign="top"><italic>Invades submucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Invasion to submucosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion into submucosa</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invasion to the submucosa</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Submucosal invasion</italic></td></tr><tr><td align="char" char="." valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Directly invades <italic>adjacent organ</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Direct invades <italic>adjacent organs or structures</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Directly invades adjacent organs or structures</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Penetrates <italic>visceral peritoneum</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Penetration of <italic>visceral peritoneum</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>subserosa</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>pericolic adipose tissue</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Invades <italic>perirectal adipose tissue</italic></td></tr><tr><td align="char" char="." 
valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastasis</italic> in <italic>1</italic> of <italic>regional lymph nodes</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>With metastasis of pericolorectal lymph node</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Tumor deposit</italic></td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">Codes 2+3 (cases corresponding to both Code 2 and Code 3)</td></tr><tr><td align="char" char="." valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Adenocarcinoma, metastatic from colon or rectum</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic mixed adenoneuroendocrine carcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic colonic adenocarcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Metastatic carcinoma</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Distant lymph node(s)</italic></td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>9<sup><xref ref-type="table-fn" rid="table4fn8">h</xref></sup></td><td align="left" valign="top">In cases without evidence</td></tr><tr><td align="left" valign="top" colspan="2">Imaging examination results</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn9">i</xref></sup></td></tr><tr><td align="char" char="." valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Invasion of lamina propria</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Submucosal invasion</italic></td></tr><tr><td align="char" char="." valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Pericolic fat infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Pericolic infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Perirectal infiltration</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Perirectal fat infiltration</italic></td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">If the N code is 1 or higher</td></tr><tr><td align="char" char="."
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">Codes 2+3 (cases corresponding to both Code 2 and Code 3)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7</td><td align="left" valign="top">If the M code is 1 or higher</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>9</td><td align="left" valign="top">In cases without evidence</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SEER: surveillance, epidemiology, and end results.</p></fn><fn id="table4fn2"><p><sup>b</sup>0: in situ.</p></fn><fn id="table4fn3"><p><sup>c</sup>1: localized only. </p></fn><fn id="table4fn4"><p><sup>d</sup>2: regional by direct extension only. </p></fn><fn id="table4fn5"><p><sup>e</sup>3: regional lymph node(s) involved only. </p></fn><fn id="table4fn6"><p><sup>f</sup>4: regional by both direct extension and regional lymph node(s) involvement. </p></fn><fn id="table4fn7"><p><sup>g</sup>7: distant site(s)/lymph node(s) involved. </p></fn><fn id="table4fn8"><p><sup>h</sup>9: unknown if extension or metastasis.</p></fn><fn id="table4fn9"><p><sup>i</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>As a result of the evaluation of the automated stage classification library, the concordance rates were 93.3% for TNM and 93.9% for SEER across the 164 cases. By leveraging a rule-based database in the QMP, we were able to supplement missing data in the target variables, resulting in a dataset aligned with the objectives of prognostic prediction.</p></sec><sec id="s3-3"><title>Stage 4: Postassessment Based on RWD</title><p>Comparing the rates of missing data before and after the QMP, the rate decreased from 75.3% to 35.7% for the TNM and from 24.3% to 18.5% for the SEER across 6491 cases. 
This demonstrates the effectiveness of the QMP (<xref ref-type="fig" rid="figure3">Figure 3</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Missing values before and after quality management. SEER: surveillance, epidemiology, and end results; TNM: tumor, node, metastasis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73884_fig03.png"/></fig><p><xref ref-type="table" rid="table5">Table 5</xref> presents a comparison of the performance of the models before and after the QMP; a slight improvement was observed. An evaluation of variable importance by feature selection revealed that TNM stage and detailed code variables (T, N, M), which were not identified before quality management, emerged as significant variables after quality management. The change in variable importance is shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, and the corresponding importance values are detailed in <xref ref-type="table" rid="table6">Table 6</xref>. Incorporating these newly identified prognostic indicators into the final model enhances its clinical relevance and interpretability.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Model performance before and after quality management.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">Before quality management</td><td align="left" valign="top">After quality management</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="char" char="." valign="top">0.933795227</td><td align="char" char="." valign="top">0.9407236336</td></tr><tr><td align="left" valign="top">Precision</td><td align="char" char="." valign="top">0.924949499</td><td align="char" char="." valign="top">0.9279243167</td></tr><tr><td align="left" valign="top">Recall</td><td align="char" char="."
valign="top">0.933795227</td><td align="char" char="." valign="top">0.9407236336</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="char" char="." valign="top">0.92898597</td><td align="char" char="." valign="top">0.9330359000</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="char" char="." valign="top">0.856226406</td><td align="char" char="." valign="top">0.8724494672</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Change in feature importance before and after quality management.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73884_fig04.png"/></fig><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Feature importance before and after quality management.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Importance</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Before quality management</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>year of initial visit_2022</td><td align="left" valign="top">0.23176</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SEER_2.0</td><td align="left" valign="top">0.17240</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Age</td><td align="left" valign="top">0.04739</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_16.0</td><td align="left" valign="top">0.04513</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>current_drinking_status_1.0</td><td align="left" valign="top">0.03755</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>year of initial visit_2017</td><td align="left" valign="top">0.03753</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_3.0</td><td align="left" valign="top">0.03664</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>family history_cancer_1.0</td><td align="left" valign="top">0.03364</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_2.0</td><td align="left" valign="top">0.03356</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_nan</td><td align="left" valign="top">0.03320</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.5</td><td align="left" valign="top">0.02482</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>current_smoke_status_nan</td><td align="left" valign="top">0.02294</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_26.0</td><td align="left" valign="top">0.02096</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_23.0</td><td align="left" valign="top">0.01356</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.1</td><td align="left" valign="top">0.01220</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>lymphatic invasion_2.0</td><td align="left" valign="top">0.01114</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>TNM_T4N2M1</td><td align="left" valign="top">0.01100</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18</td><td align="left" valign="top">0.01072</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.3</td><td align="left" valign="top">0.01059</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_K83.8</td><td align="left" valign="top">0.01029</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>molecular_pathology_findings_nan</td><td align="left" valign="top">0.00896</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C170.</td><td align="left" valign="top">0.00894</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BMI<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="left" valign="top">0.00886</td></tr><tr><td align="left" valign="top" colspan="2">After quality management</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>year of initial visit_2022</td><td align="left" valign="top">0.11148</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>TNM<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup>_TxN2M0</td><td align="left" valign="top">0.07741</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N stage_2</td><td align="left" valign="top">0.05068</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_16.0</td><td align="char" char="." valign="top">0.05061</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>age</td><td align="char" char="." valign="top">0.05013</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_nan</td><td align="char" char="." valign="top">0.04725</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SEER<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup>_2.0</td><td align="char" char="." valign="top">0.04599</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_2.0</td><td align="char" char="." valign="top">0.04532</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>perineural invasion_3.0</td><td align="char" char="." 
valign="top">0.04489</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>current_drinking_status_1.0</td><td align="char" char="." valign="top">0.03479</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>year of initial visit_2017</td><td align="char" char="." valign="top">0.03067</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>family history_cancer_1.0</td><td align="char" char="." valign="top">0.02972</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.5</td><td align="char" char="." valign="top">0.02883</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>TNM_TxNxM0</td><td align="char" char="." valign="top">0.02201</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_23.0</td><td align="char" char="." valign="top">0.01986</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>current_smoke_status_nan</td><td align="char" char="." valign="top">0.01954</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.1</td><td align="char" char="." valign="top">0.01947</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18.3</td><td align="char" char="." 
valign="top">0.01790</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C170.</td><td align="char" char="." valign="top">0.01747</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>year of initial visit_2015</td><td align="char" char="." valign="top">0.01526</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>molecular_pathology_findings_nan</td><td align="char" char="." valign="top">0.01374</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_K83.8</td><td align="char" char="." valign="top">0.01342</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>lymphatic invasion_2.0</td><td align="char" char="." valign="top">0.01218</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T stage_nan</td><td align="char" char="." valign="top">0.01105</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>histological diagnosis_26.0</td><td align="char" char="." valign="top">0.01104</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_C18</td><td align="char" char="." valign="top">0.01093</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>primary site_nan</td><td align="char" char="." 
valign="top">0.01088</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>BMI: body mass index.</p></fn><fn id="table6fn2"><p><sup>b</sup>TNM: tumor, node, metastasis.</p></fn><fn id="table6fn3"><p><sup>c</sup>SEER: surveillance, epidemiology, and end results.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study proposed a QMP to generate high-quality data. We used the K-CURE dataset to develop the QMP and applied it to a CRC clinical library to evaluate the quality improvement effects. After applying the process, TNM stage and individual T, N, and M codes emerged as important factors when constructing a prognostic model. This suggests that the proposed QMP can create high-quality data for research.</p><p>Gaps in datasets can occur due to direct omissions of data, limitations in data collection, and technical issues [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. Missing values may arise due to patient movement, treatment interruptions, or omitted tests or procedures, resulting in the loss of important variables. Various methods, such as statistical imputation or ML-based techniques, have been proposed to address missing data but often fail to fully reflect the complexity of clinical environments [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. This reduces the reliability of data over the long term, affecting dataset quality and reducing the reliability of findings.</p><p>Various basic statistical methods, such as imputation, have been used to address missing data [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>].
More recently, ML-based methods such as K-nearest neighbor [<xref ref-type="bibr" rid="ref29">29</xref>], matrix factorization [<xref ref-type="bibr" rid="ref30">30</xref>], and random forest approaches have also emerged [<xref ref-type="bibr" rid="ref31">31</xref>]. These methods are effective when missing data are not random and do not follow specific patterns, as they learn from the dataset itself and predict missing values [<xref ref-type="bibr" rid="ref32">32</xref>]. This makes them relatively insensitive to the rates or patterns of missing data. Novel techniques such as attention-based models [<xref ref-type="bibr" rid="ref33">33</xref>] or the large language model forest framework have also been applied [<xref ref-type="bibr" rid="ref34">34</xref>]. However, previous studies have focused on evaluating and replacing missing values, rather than applying multistage processes to improve overall data quality.</p><p>In this study, we reviewed several previous studies on CRC to construct an improved dataset and identify prognostic factors. For clinical research, it is crucial to identify and evaluate factors with strong evidence-based associations with prognoses [<xref ref-type="bibr" rid="ref35">35</xref>]. However, in our study, theoretically important variables were not always selected from the actual data, and some missing values could not be addressed through the QMP. This indicates that there was a lack of information on important variables during the initial stages of data construction. Therefore, important prognostic variables should be thoroughly reviewed and systematically managed from the initial stages of data construction.</p><p>Using CRC staging guidelines, we performed labeling by extracting text-based terms from pathology reports and imaging test results to establish a rule-based QMP. Recently, there has been a trend toward research focusing on developing rule-based quality management and quality assessment methodologies using medical data. 
This expands the possibility of systematically detecting and correcting errors in data [<xref ref-type="bibr" rid="ref36">36</xref>]. This approach effectively analyzes clinical quality issues, improves data accuracy, and provides reliable information for clinical research and decision-making [<xref ref-type="bibr" rid="ref37">37</xref>]. Such a strategy has been found to be applicable to real-world medical data [<xref ref-type="bibr" rid="ref38">38</xref>]. The QMP developed in this study shows the utility of rule-based systems, generating data with improved completeness. Applying this approach could provide accurate data for future prognostic prediction and decision support systems.</p><p>Traditional quality management methodologies focus on preventing and correcting errors during data construction and operation [<xref ref-type="bibr" rid="ref39">39</xref>]. For example, such methods often rely on automated systems or checklists to minimize input errors or to validate the accuracy of collected data [<xref ref-type="bibr" rid="ref40">40</xref>]. However, we propose a rule-based QMP that identifies and corrects missing values and errors in datasets that are already established. This approach not only addresses potential issues that can occur during the data construction phase, but also facilitates the detection and resolution of missing data that arise during data analysis.</p><p>Recently, there have been active attempts in medical research to develop QMP systems using various clinical and public datasets, including electronic medical record data [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. This approach is essential for institutions with large-scale medical datasets and platforms built from multiple integrated datasets. In multi-center research, a method to prioritize data quality dimensions and key evaluation variables, supported by feedback systems to monitor and assess data quality, has been proposed. 
This study provides a foundation for the automation of future QMP systems and the development of new approaches using AI and ML, enhancing the usage of medical data by researchers in public data platforms.</p><p>We focused on addressing missing data for quality management; we have not proposed a comprehensive solution for various data errors in clinical environments. Another limitation is the complexity of clinical staging decisions&#x2014;involving multidisciplinary discussions, treatments such as neoadjuvant therapy, and surgical findings&#x2014;which can lead to discrepancies or missing values in retrospective research data. This complexity may influence the interpretation of the study results and may affect the generalizability of the data. Nonetheless, this work is important in that we propose a systematic process to improve the quality and applicability of real-world medical data. Future efforts should consider advanced processes that address the entire data lifecycle, from construction to usage and operation.</p></sec><sec id="s4-2"><title>Conclusion</title><p>We developed a rule-based QMP that improves data quality and identifies key prognostic factors in CRC datasets. Although missing data and other complex challenges in real-world clinical data remain, the approach demonstrates the utility of systematic quality management.
Future work should expand the QMP to address diverse data errors across the data lifecycle.</p></sec></sec></body><back><ack><p>This research was funded by the National Research Foundation of Korea (NRF; grant number 2020R1C1C009679).</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CRC</term><def><p>colorectal cancer</p></def></def-item><def-item><term id="abb3">K-CURE</term><def><p>Korea Clinical Data Use Network for Research Excellence</p></def></def-item><def-item><term id="abb4">QMP</term><def><p>quality management process</p></def></def-item><def-item><term id="abb5">RWD</term><def><p>real-world data</p></def></def-item><def-item><term id="abb6">SEER</term><def><p>Surveillance, Epidemiology, and End Results</p></def></def-item><def-item><term id="abb7">TNM</term><def><p>tumor, node, metastasis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Shortliffe</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Barnett</surname><given-names>GO</given-names> </name></person-group><article-title>Medical Data: Their Acquisition, Storage, and Use</article-title><source>Medical Informatics: Computer Applications in Health Care and Biomedicine</source><year>2001</year><publisher-name>Springer</publisher-name><fpage>41</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1007/978-0-387-21721-5_2</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayaad</surname><given-names>O</given-names> </name><name name-style="western"><surname>Alloubani</surname><given-names>A</given-names> </name><name
name-style="western"><surname>ALhajaa</surname><given-names>EA</given-names> </name><etal/></person-group><article-title>The role of electronic medical records in improving the quality of health care services: comparative study</article-title><source>Int J Med Inform</source><year>2019</year><month>07</month><volume>127</volume><fpage>63</fpage><lpage>67</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.04.014</pub-id><pub-id pub-id-type="medline">31128833</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Opportunities and challenges in developing deep learning models using electronic health records data: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>10</month><day>1</day><volume>25</volume><issue>10</issue><fpage>1419</fpage><lpage>1428</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy068</pub-id><pub-id pub-id-type="medline">29893864</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alsuliman</surname><given-names>T</given-names> </name><name name-style="western"><surname>Humaidan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sliman</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dul&#x00E9;ry</surname><given-names>R</given-names> </name></person-group><article-title>Introduction to medical data and big data exploitation in research: errors, solutions and trends</article-title><source>Curr Res Transl 
Med</source><year>2021</year><month>10</month><volume>69</volume><issue>4</issue><fpage>103310</fpage><pub-id pub-id-type="doi">10.1016/j.retram.2021.103310</pub-id><pub-id pub-id-type="medline">34419934</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Li</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name></person-group><article-title>Artificial intelligence for assisting cancer diagnosis and treatment in the era of precision medicine</article-title><source>Cancer Commun</source><year>2021</year><month>11</month><volume>41</volume><issue>11</issue><fpage>1100</fpage><lpage>1115</lpage><pub-id pub-id-type="doi">10.1002/cac2.12215</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name></person-group><article-title>Current status and issues of data management plan in Korea</article-title><source>J Korea Contents Assoc</source><year>2020</year><volume>20</volume><issue>6</issue><fpage>220</fpage><lpage>229</lpage><pub-id pub-id-type="doi">10.5392/JKCA.2020.20.06.220</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>McGuckin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Crick</surname><given-names>K</given-names> </name><name name-style="western"><surname>Myroniuk</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Setchell</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yeung</surname><given-names>RO</given-names> </name><name name-style="western"><surname>Campbell-Scherer</surname><given-names>D</given-names> </name></person-group><article-title>Understanding challenges of using routinely collected health data to address clinical care gaps: a case study in Alberta, Canada</article-title><source>BMJ Open Qual</source><year>2022</year><month>01</month><volume>11</volume><issue>1</issue><fpage>e001491</fpage><pub-id pub-id-type="doi">10.1136/bmjoq-2021-001491</pub-id><pub-id pub-id-type="medline">34996811</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ta</surname><given-names>CN</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Detecting systemic data quality issues in electronic health records</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>383</fpage><lpage>387</lpage><pub-id pub-id-type="doi">10.3233/SHTI190248</pub-id><pub-id pub-id-type="medline">31437950</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gehrmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Herczog</surname><given-names>E</given-names> </name><name name-style="western"><surname>Decker</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Beyan</surname><given-names>O</given-names> </name></person-group><article-title>What prevents us from reusing medical real-world data in research</article-title><source>Sci Data</source><year>2023</year><month>07</month><day>13</day><volume>10</volume><issue>1</issue><fpage>459</fpage><pub-id pub-id-type="doi">10.1038/s41597-023-02361-2</pub-id><pub-id pub-id-type="medline">37443164</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shafqat</surname><given-names>W</given-names> </name><name name-style="western"><surname>Byun</surname><given-names>YC</given-names> </name></person-group><article-title>A hybrid GAN-based approach to solve imbalanced data problem in recommendation systems</article-title><source>IEEE Access</source><year>2022</year><volume>10</volume><fpage>11036</fpage><lpage>11047</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2022.3141776</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Whang</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Roh</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Song</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JG</given-names> </name></person-group><article-title>Data collection and quality challenges in deep learning: a data-centric AI perspective</article-title><source>VLDB J</source><year>2023</year><month>07</month><volume>32</volume><issue>4</issue><fpage>791</fpage><lpage>813</lpage><pub-id pub-id-type="doi">10.1007/s00778-022-00775-9</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Alalwani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lucas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alzubaidi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Alam</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Deep learning in colorectal cancer classification: a scoping review</article-title><source>Healthcare Transformation with Informatics and Artificial Intelligence</source><year>2023</year><fpage>616</fpage><lpage>619</lpage><pub-id pub-id-type="doi">10.3233/SHTI230573</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedrikovetski</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dudi-Venkata</surname><given-names>NN</given-names> </name><name name-style="western"><surname>Kroon</surname><given-names>HM</given-names> </name><etal/></person-group><article-title>Artificial intelligence for pre-operative lymph node staging in colorectal cancer: a systematic review and meta-analysis</article-title><source>BMC Cancer</source><year>2021</year><month>09</month><day>26</day><volume>21</volume><issue>1</issue><fpage>1058</fpage><pub-id pub-id-type="doi">10.1186/s12885-021-08773-w</pub-id><pub-id pub-id-type="medline">34565338</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rompianesi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Pegoraro</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Ceresa</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Montalti</surname><given-names>R</given-names> </name><name name-style="western"><surname>Troisi</surname><given-names>RI</given-names> </name></person-group><article-title>Artificial intelligence in the diagnosis and management of colorectal cancer liver metastases</article-title><source>World J Gastroenterol</source><year>2022</year><month>01</month><day>7</day><volume>28</volume><issue>1</issue><fpage>108</fpage><lpage>122</lpage><pub-id pub-id-type="doi">10.3748/wjg.v28.i1.108</pub-id><pub-id pub-id-type="medline">35125822</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kale</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wankhede</surname><given-names>N</given-names> </name><name name-style="western"><surname>Pawar</surname><given-names>R</given-names> </name><etal/></person-group><article-title>AI-driven innovations in Alzheimer&#x2019;s disease: Integrating early diagnosis, personalized treatment, and prognostic modelling</article-title><source>Ageing Res Rev</source><year>2024</year><month>11</month><volume>101</volume><fpage>102497</fpage><pub-id pub-id-type="doi">10.1016/j.arr.2024.102497</pub-id><pub-id pub-id-type="medline">39293530</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Diaz</surname><given-names>O</given-names> </name><name name-style="western"><surname>Kushibar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Osuala</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Data preparation for artificial intelligence in medical imaging: a comprehensive guide to open-access platforms 
and tools</article-title><source>Phys Med</source><year>2021</year><month>03</month><volume>83</volume><fpage>25</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.1016/j.ejmp.2021.02.007</pub-id><pub-id pub-id-type="medline">33684723</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Janett</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Yeracaris</surname><given-names>PP</given-names> </name></person-group><article-title>Electronic medical records in the American health system: challenges and lessons learned</article-title><source>Ci&#x00EA;nc sa&#x00FA;de coletiva</source><year>2020</year><volume>25</volume><issue>4</issue><fpage>1293</fpage><lpage>1304</lpage><pub-id pub-id-type="doi">10.1590/1413-81232020254.28922019</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Malin</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name></person-group><article-title>Predicting missing values in medical data via XGBoost regression</article-title><source>J Healthc Inform Res</source><year>2020</year><month>12</month><volume>4</volume><issue>4</issue><fpage>383</fpage><lpage>394</lpage><pub-id pub-id-type="doi">10.1007/s41666-020-00077-1</pub-id><pub-id pub-id-type="medline">33283143</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Washington</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Brookland</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Gershenwald</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Compton</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Hess</surname><given-names>KR</given-names> </name><etal/></person-group><source>AJCC Cancer Staging Manual</source><year>2017</year><edition>8</edition><publisher-name>New York, NY: Springer</publisher-name><pub-id pub-id-type="other">9783319406176</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Um</surname><given-names>JW</given-names> </name></person-group><source>Korean Clinical Guideline for Colon and Rectal Cancer v 1.0</source><year>2012</year><publisher-name>Seoul: Korean Academy of Medical Sciences</publisher-name></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ruhl</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Callaghan</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Schussler</surname><given-names>N</given-names> </name></person-group><source>Summary Stage 2018: Codes and Coding Instructions</source><year>2024</year><publisher-name>Bethesda, MD: National Cancer Institute</publisher-name></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Austin</surname><given-names>PC</given-names> </name><name name-style="western"><surname>White</surname><given-names>IR</given-names> </name><name
name-style="western"><surname>Lee</surname><given-names>DS</given-names> </name><name name-style="western"><surname>van Buuren</surname><given-names>S</given-names> </name></person-group><article-title>Missing data in clinical research: a tutorial on multiple imputation</article-title><source>Can J Cardiol</source><year>2021</year><month>09</month><volume>37</volume><issue>9</issue><fpage>1322</fpage><lpage>1331</lpage><pub-id pub-id-type="doi">10.1016/j.cjca.2020.11.010</pub-id><pub-id pub-id-type="medline">33276049</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Purwar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>SK</given-names> </name></person-group><article-title>Hybrid prediction model with missing value imputation for medical data</article-title><source>Expert Syst Appl</source><year>2015</year><month>08</month><volume>42</volume><issue>13</issue><fpage>5621</fpage><lpage>5631</lpage><pub-id pub-id-type="doi">10.1016/j.eswa.2015.02.050</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>CF</given-names> </name></person-group><article-title>Missing value imputation: a review and analysis of the literature (2006&#x2013;2017)</article-title><source>Artif Intell Rev</source><year>2020</year><month>02</month><volume>53</volume><issue>2</issue><fpage>1487</fpage><lpage>1509</lpage><pub-id pub-id-type="doi">10.1007/s10462-019-09709-4</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wells</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Chagin</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Nowacki</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Kattan</surname><given-names>MW</given-names> </name></person-group><article-title>Strategies for handling missing data in electronic health record derived data</article-title><source>EGEMS (Wash DC)</source><year>2013</year><volume>1</volume><issue>3</issue><fpage>1035</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1035</pub-id><pub-id pub-id-type="medline">25848578</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bertsimas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pawlowski</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhuo</surname><given-names>YD</given-names> </name></person-group><article-title>From predictive methods to missing data imputation: an optimization approach</article-title><source>J Mach Learn Res</source><year>2018</year><access-date>2025-10-17</access-date><volume>18</volume><issue>196</issue><fpage>1</fpage><lpage>39</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://jmlr.org/papers/v18/17-073.html">http://jmlr.org/papers/v18/17-073.html</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raja</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Thangavel</surname><given-names>K</given-names> </name></person-group><article-title>Missing value imputation using unsupervised machine learning techniques</article-title><source>Soft 
Comput</source><year>2020</year><month>03</month><volume>24</volume><issue>6</issue><fpage>4361</fpage><lpage>4392</lpage><pub-id pub-id-type="doi">10.1007/s00500-019-04199-6</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wo&#x017A;nica</surname><given-names>K</given-names> </name><name name-style="western"><surname>Biecek</surname><given-names>P</given-names> </name></person-group><article-title>Does imputation matter? Benchmark for predictive models</article-title><source>arXiv</source><comment>Preprint posted online on Jul 6, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2007.02837</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Batista</surname><given-names>GEAPA</given-names> </name><name name-style="western"><surname>Monard</surname><given-names>MC</given-names> </name></person-group><article-title>An analysis of four missing data treatment methods for supervised learning</article-title><source>Appl Artif Intell</source><year>2003</year><month>05</month><volume>17</volume><issue>5-6</issue><fpage>519</fpage><lpage>533</lpage><pub-id pub-id-type="doi">10.1080/713827181</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mazumder</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hastie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tibshirani</surname><given-names>R</given-names> </name></person-group><article-title>Spectral regularization algorithms for learning large incomplete matrices</article-title><source>J Mach Learn
Res</source><year>2010</year><month>03</month><day>1</day><volume>11</volume><fpage>2287</fpage><lpage>2322</lpage><pub-id pub-id-type="medline">21552465</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stekhoven</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>B&#x00FC;hlmann</surname><given-names>P</given-names> </name></person-group><article-title>MissForest&#x2014;non-parametric missing value imputation for mixed-type data</article-title><source>Bioinformatics</source><year>2012</year><month>01</month><day>1</day><volume>28</volume><issue>1</issue><fpage>112</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btr597</pub-id><pub-id pub-id-type="medline">22039212</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thomas</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rajabi</surname><given-names>E</given-names> </name></person-group><article-title>A systematic review of machine learning-based missing value imputation techniques</article-title><source>Data Technol Appl</source><year>2021</year><month>08</month><day>5</day><volume>55</volume><issue>4</issue><fpage>558</fpage><lpage>585</lpage><pub-id pub-id-type="doi">10.1108/DTA-12-2020-0298</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kowsar</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rabbani</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Samad</surname><given-names>MD</given-names> </name></person-group><person-group person-group-type="editor"><name
name-style="western"><surname>Kowsar</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rabbani</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Samad</surname><given-names>MD</given-names> </name></person-group><article-title>Attention-based imputation of missing values in electronic health records tabular data</article-title><conf-name>2024 IEEE 12th International Conference on Healthcare Informatics (ICHI)</conf-name><conf-date>Jun 3-6, 2024</conf-date><pub-id pub-id-type="doi">10.1109/ICHI61247.2024.00030</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ban</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>CB</given-names> </name><name name-style="western"><surname>He</surname><given-names>J</given-names> </name></person-group><article-title>LLM-forest for health tabular data imputation</article-title><source>arXiv</source><comment>Preprint posted online on Oct 28, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.21520</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><name name-style="western"><surname>He</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Risk factors and risk prediction models for colorectal cancer metastasis and recurrence: an umbrella review of
systematic reviews and meta-analyses of observational studies</article-title><source>BMC Med</source><year>2020</year><month>06</month><day>26</day><volume>18</volume><issue>1</issue><fpage>172</fpage><pub-id pub-id-type="doi">10.1186/s12916-020-01618-6</pub-id><pub-id pub-id-type="medline">32586325</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohamed</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Song</surname><given-names>X</given-names> </name><name name-style="western"><surname>McMahon</surname><given-names>TM</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name></person-group><article-title>Tailoring rule-based data quality assessment to the patient-centered outcomes research network (PCORnet) common data model (CDM)</article-title><source>AMIA Annu Symp Proc</source><year>2022</year><volume>2022</volume><fpage>775</fpage><lpage>784</lpage><pub-id pub-id-type="medline">37128433</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dagtas</surname><given-names>S</given-names> </name><name name-style="western"><surname>Talburt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Baghal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zozus</surname><given-names>M</given-names> </name></person-group><article-title>Rule-based data quality assessment and monitoring system in healthcare facilities</article-title><source>Improving Usability, Safety and Patient Outcomes with Health 
Information Technology</source><year>2019</year><publisher-name>IOS Press</publisher-name><fpage>460</fpage><lpage>467</lpage><pub-id pub-id-type="doi">10.3233/978-1-61499-951-5-460</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Talburt</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dagtas</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zozus</surname><given-names>MN</given-names> </name></person-group><article-title>A rule-based data quality assessment system for electronic health record data</article-title><source>Appl Clin Inform</source><year>2020</year><month>08</month><volume>11</volume><issue>4</issue><fpage>622</fpage><lpage>634</lpage><pub-id pub-id-type="doi">10.1055/s-0040-1715567</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Raebel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Glanz</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Riedlinger</surname><given-names>K</given-names> </name><name name-style="western"><surname>Steiner</surname><given-names>JF</given-names> </name></person-group><article-title>A pragmatic framework for single-site and multisite data quality assessment in electronic health record-based clinical research</article-title><source>Med Care</source><year>2012</year><month>07</month><volume>50 Suppl</volume><fpage>S21</fpage><lpage>S29</lpage><pub-id
pub-id-type="doi">10.1097/MLR.0b013e318257dd67</pub-id><pub-id pub-id-type="medline">22692254</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Methods and dimensions of electronic health record data quality assessment: enabling reuse for clinical research</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>01</month><day>1</day><volume>20</volume><issue>1</issue><fpage>144</fpage><lpage>151</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000681</pub-id><pub-id pub-id-type="medline">22733976</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Roh</surname><given-names>GH</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Woo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name></person-group><article-title>Effective data quality management for electronic medical record data using SMART DATA</article-title><source>Int J Med Inform</source><year>2023</year><month>12</month><volume>180</volume><fpage>105262</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105262</pub-id><pub-id pub-id-type="medline">37871445</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>Makeleni</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cilliers</surname><given-names>L</given-names> </name></person-group><article-title>Critical success factors to improve data quality of electronic medical records in public healthcare institutions</article-title><source>S Afr J Inf Manag</source><year>2021</year><volume>23</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.4102/sajim.v23i1.1230</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reimer</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Milinovich</surname><given-names>A</given-names> </name><name name-style="western"><surname>Madigan</surname><given-names>EA</given-names> </name></person-group><article-title>Data quality assessment framework to assess electronic medical record data for use in research</article-title><source>Int J Med Inform</source><year>2016</year><month>06</month><volume>90</volume><fpage>40</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2016.03.006</pub-id><pub-id pub-id-type="medline">27103196</pub-id></nlm-citation></ref></ref-list></back></article>