<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e67591</article-id><article-id pub-id-type="doi">10.2196/67591</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Biases in Race and Ethnicity Introduced by Filtering Electronic Health Records for &#x201C;Complete Data&#x201D;: Observational Clinical Data Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Acitores Cortina</surname><given-names>Jose Miguel</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Fatapour</surname><given-names>Yasaman</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Brown</surname><given-names>Kathleen LaRow</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gisladottir</surname><given-names>Undina</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zietz</surname><given-names>Michael</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bear Don't Walk IV</surname><given-names>Oliver John</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Peter</surname><given-names>Danner</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Berkowitz</surname><given-names>Jacob S</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Friedrich</surname><given-names>Nadine A</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kivelson</surname><given-names>Sophia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kuchi</surname><given-names>Aditi</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Hongyu</given-names></name><degrees>PhD, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Srinivasan</surname><given-names>Apoorva</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tsang</surname><given-names>Kevin K</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Tatonetti</surname><given-names>Nicholas P</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Computational Biomedicine, Cedars-Sinai Medical Center</institution><addr-line>700 North San Vicente Boulevard, Pacific Design Center Suite G540</addr-line><addr-line>Los Angeles</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Cedars-Sinai Cancer, Cedars-Sinai Medical Center</institution><addr-line>Los Angeles</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Systems Biology, Columbia University</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Biomedical Informatics, Columbia University</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of 
Biomedical Informatics and Medical Education, University of Washington</institution><addr-line>Seattle</addr-line><addr-line>WA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Weber</surname><given-names>Griffin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Evbuomwan</surname><given-names>Oghosa</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Nicholas P Tatonetti, PhD, Department of Computational Biomedicine, Cedars-Sinai Medical Center, 700 North San Vicente Boulevard, Pacific Design Center Suite G540, Los Angeles, CA, 90069, United States, 1 424 315 1031; <email>nicholas.tatonetti@cshs.org</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>3</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e67591</elocation-id><history><date date-type="received"><day>15</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>06</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>12</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jose Miguel Acitores Cortina, Yasaman Fatapour, Kathleen LaRow Brown, Undina Gisladottir, Michael Zietz, Oliver John Bear Don't Walk IV, Danner Peter, Jacob S Berkowitz, Nadine A Friedrich, Sophia Kivelson, Aditi Kuchi, Hongyu Liu, Apoorva Srinivasan, Kevin K Tsang, Nicholas P Tatonetti. 
Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 27.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e67591"/><abstract><sec><title>Background</title><p>Integrated clinical databases from national biobanks have advanced the capacity for disease research. Data quality and completeness filters are used when building clinical cohorts to address limitations of data missingness. However, these filters may unintentionally introduce systemic biases when they are correlated with race and ethnicity.</p></sec><sec><title>Objective</title><p>In this study, we examined the race and ethnicity biases introduced by applying common filters to 4 clinical records databases. 
Specifically, we evaluated whether these filters introduce biases that disproportionately exclude minoritized groups.</p></sec><sec sec-type="methods"><title>Methods</title><p>We applied 19 commonly used data filters to electronic health record datasets from 4 geographically varied locations comprising close to 12 million patients to understand how using these filters introduces sample bias along racial and ethnic groupings. These filters covered a range of information, including demographics, medication records, visit details, and observation periods. We observed the variation in sample drop-off between self-reported ethnic and racial groups for each site as we applied each filter individually.</p></sec><sec sec-type="results"><title>Results</title><p>Applying the observation period filter substantially reduced data availability across all races and ethnicities in all 4 datasets. However, among those examined, the availability of data in the white group remained consistently higher than in other racial groups after applying each filter. Conversely, the Black or African American group was the most impacted by each filter in 3 of the datasets: the Cedars-Sinai dataset, UK Biobank, and the Columbia University dataset. Among the 4 distinct datasets, only applying the filters to the All of Us dataset resulted in minimal deviation from the baseline, with most racial and ethnic groups following a similar pattern.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings underscore the importance of using only necessary filters, as they might disproportionately affect data availability of minoritized racial and ethnic populations. Researchers must consider these unintentional biases when performing data-driven research and explore techniques to minimize the impact of these filters, such as probabilistic methods or adjusted cohort selection methods. 
Additionally, we recommend disclosing sample sizes for racial and ethnic groups both before and after data filters are applied to aid the reader in understanding the generalizability of the results. Future work should focus on exploring the effects of filters on downstream analyses.</p></sec></abstract><kwd-group><kwd>health disparities</kwd><kwd>data quality</kwd><kwd>observational research</kwd><kwd>electronic health records</kwd><kwd>racial and ethnic biases</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The rapid adoption of electronic health records (EHRs) in the past decade has greatly expanded the availability and accessibility of clinical data. This advancement enables health care professionals to harness vast amounts of information, driving medical research, personalized medicine, and overall improvements in health care delivery [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Additionally, it helps to build big data in health care, providing a foundation for advanced analytics and informed decision-making on a large scale. This supports the training and validation of artificial intelligence methodologies and models, leading to improved diagnostic accuracy, personalized treatment plans, and more efficient health care delivery. Therefore, the collected data significantly influence the results and hypotheses derived from these methods.</p><p>To improve diversity in health care, studies must include populations underrepresented in the biomedical, clinical, behavioral, and social sciences, such as individuals from racial and ethnic minority groups, those with disabilities, and people from disadvantaged backgrounds [<xref ref-type="bibr" rid="ref3">3</xref>]. 
Fostering diversity is vital for producing more accurate, inclusive research outcomes that reflect the needs of all populations, ultimately leading to more equitable health care and improved patient outcomes [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>However, the usefulness of available clinical data is limited if it does not reliably reflect the diversity of the underlying population. Bias in health care research refers to systematic errors or deviations that lead to inaccurate or skewed results, interpretations, or decisions. It usually occurs when certain factors, whether intentional or unintentional, disproportionately influence the research process, leading to outcomes that do not accurately represent the truth. Thus, scientific progress is delayed, flawed conclusions are perpetuated, and disparities in health care outcomes are reinforced [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Lack of diversity, which may be due to systemic biases and discrimination against individuals and groups from minoritized populations, can lead to biased research outcomes that exacerbate health disparities [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. This can lead to inaccurate conclusions about treatment or interventions that may not apply equally across different populations. In addition, there are still not enough big clinical longitudinal datasets, which are essential for understanding long-term health trends, tracking the progression of diseases over time, and evaluating treatment outcomes.</p><p>When conducting observational clinical data analysis, it is often preferable to aim for a dataset that is as complete as possible. 
However, well-meaning filters that improve completeness may introduce unintended biases in the target population [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>Data completeness can be defined as the extent to which EHRs or other data sources capture all necessary and relevant information to accurately represent a patient&#x2019;s medical history, care processes, or outcomes. It includes both the presence and accuracy of essential data elements, such as diagnoses, treatments, laboratory results, and preventive care measures, ensuring a comprehensive and reliable foundation for clinical care, research, and quality assessment [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>In this study, we aim to evaluate the effect of data completeness filters originally used by Weber et al [<xref ref-type="bibr" rid="ref11">11</xref>] on different datasets and how various filters impact the patient cohort. This work extends the analysis to 4 large datasets, including the All of Us (AoU) dataset, UK Biobank (UKBB), and 2 geographically distinct academic medical centers. Specifically, we focus on identifying race and ethnicity biases introduced by commonly used filters [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>We examined 4 distinct data sources (AoU, UKBB, the Columbia University dataset, and the Cedars-Sinai dataset), together comprising approximately 12 million patients. By analyzing the available data and applying each filter, we aimed to investigate the potential biases these filters may introduce.</p><sec id="s2-1"><title>All of Us</title><p>The AoU study, sponsored by the National Institutes of Health, has enrolled more than 814,000 participants as of June 18, 2024, with 80% of them coming from underrepresented populations [<xref ref-type="bibr" rid="ref16">16</xref>]. 
These groups include racial and ethnic minorities, people with disabilities, those in rural or underserved areas, and individuals from lower socioeconomic backgrounds. Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (provided by the AoU study) showcases the self-reported races and ethnicities of the participants who have completed the initial steps of the program, providing a diverse representation. The recruitment process spans all regions of the United States.</p><p>The AoU workbench encompasses a wealth of information gathered from EHRs, including data from Fitbit devices, survey responses, and socioeconomic factors. Notably, a recent release of data in April 2023 included approximately 245,400 whole genome sequencing records and 312,940 genotyping microarrays, further enhancing the dataset&#x2019;s depth and potential for analysis.</p></sec><sec id="s2-2"><title>UK Biobank</title><p>The UKBB is a large-scale, population-based study that aims to improve the prevention, diagnosis, and treatment of various diseases. It involves the collection of extensive health-related data, including genetic information, from over 500,000 participants in the United Kingdom. Participants in the UKBB, recruited at ages 40&#x2010;69 years, were registered with the National Health Service. Researchers can download the data through the UKBB&#x2019;s Data Showcase, which collaborates closely with the European Genome-phenome Archive.</p></sec><sec id="s2-3"><title>Cedars-Sinai</title><p>Cedars-Sinai Medical Center (CSMC) is one of the largest hospitals in California, based in Los Angeles, and serves up to 1 million diverse patients every year across its 40 locations in Southern California. CSMC also serves as a large research center. 
The studied database comprises over 4 million patients.</p></sec><sec id="s2-4"><title>Columbia University</title><p>Columbia University Irving Medical Center (CUIMC) is a clinical, research, and educational enterprise located on a campus in Northern Manhattan. They are home to 4 colleges and schools that work on scientific research, education, and patient care. The studied database comprises over 5 million patients.</p><p>The self-reported race and ethnicity distributions of each dataset, along with the number of participants in each dataset, are presented in <xref ref-type="table" rid="table1">Table 1</xref>. In addition to those self-reported categories for race and ethnicity, we defined an all group, which includes every patient in that specific dataset. This group serves as a baseline for comparison within the dataset.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Self-reported race and ethnicity percentages of each dataset, along with the total number of participants in each dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">CSMC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">CUIMC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">All of Us</td><td align="left" valign="bottom">UKBB<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Total patients, n</td><td align="left" valign="top">4,031,307</td><td align="left" valign="top">7,121,848</td><td align="left" valign="top">287,012</td><td align="left" valign="top">502,364</td></tr><tr><td align="left" valign="top" colspan="5"><bold>Race, n (%)</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>American Indian and/or Alaska 
Native</td><td align="left" valign="top">5695 (0.14)</td><td align="left" valign="top">8275 (0.12)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Asian</td><td align="left" valign="top">205,978 (5.11)</td><td align="left" valign="top">100,046 (1.40)</td><td align="left" valign="top">8294 (2.89)</td><td align="left" valign="top">11,472 (2.28)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black or African</td><td align="left" valign="top">373,130 (9.26)</td><td align="left" valign="top">445,623 (6.26)</td><td align="left" valign="top">58,264 (20.30)</td><td align="left" valign="top">3552 (0.71)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Native Hawaiian or Pacific Islanders</td><td align="left" valign="top">7082 (0.18)</td><td align="left" valign="top">6693 (0.09)</td><td align="left" valign="top">344 (0.12)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White</td><td align="left" valign="top">1,992,336 (49.42)</td><td align="left" valign="top">1,252,219 (17.58)</td><td align="left" valign="top">154,678 (53.89)</td><td align="left" valign="top">473,353 (94.22)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mixed</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">5000 (1.74)</td><td align="left" valign="top">1731 (0.34)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1,037,027 (14.56)</td><td align="left" valign="top">4727 (1.65)</td><td align="left" valign="top">10,394 (2.07)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unknown</td><td align="left" valign="top">1,447,086 (35.89)</td><td align="left" valign="top">4,271,965 (59.99)</td><td align="left" valign="top">55,705 (19.40)</td><td align="left" valign="top">1931 (0.38)</td></tr><tr><td align="left" valign="top" colspan="5"><bold>Ethnicity, n (%)</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hispanic or Latino</td><td align="left" valign="top">344,708 (8.55)</td><td align="left" valign="top">657,288 (9.23)</td><td align="left" valign="top">54,054 (18.83)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Non-Hispanic or non-Latino</td><td align="left" valign="top">1,691,775 (41.97)</td><td align="left" valign="top">1,298,181 (18.23)</td><td align="left" valign="top">221,935 (77.32)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unknown</td><td align="left" valign="top">1,994,824 (49.48)</td><td align="left" valign="top">5,166,379 (72.54)</td><td align="left" valign="top">11,023 (3.84)</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CSMC: Cedars-Sinai Medical Center.</p></fn><fn id="table1fn2"><p><sup>b</sup>CUIMC: Columbia University Irving Medical Center.</p></fn><fn id="table1fn3"><p><sup>c</sup>UKBB: UK Biobank.</p></fn><fn 
id="table1fn4"><p><sup>d</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>A straightforward approach to identifying subsets of patients whose data are suitable for research studies is to use heuristic computational filters [<xref ref-type="bibr" rid="ref11">11</xref>] that exclude patients lacking various types of data in their records. For this study, we evaluated 19 different filters, which can be grouped into 3 categories. The first category is based on patient demographics. This includes filters that check whether the patient has both age and sex recorded (AgeSex), whether the patient is alive at the time of the search (Alive), and whether the patient has a known address or zip code (Address or zip code), as well as a set of age filters. The age filters were applied to the age at the time of any diagnosis; for example, the age filter &#x2265;65 selects patients who were 65 years or older at the time of any of their recorded diagnoses.</p><p>The second category is a record-based filter, which checks whether patients have at least 1 recorded instance of various medical data. These filters are the presence of at least 1 diagnosis, the presence of medication records, and records for outpatient visits.</p><p>The last category is the time span or observational period filter, which selects patients who have had multiple interactions with the health care system during a specific period of time. The maximum time window for this category was the 6-year follow-up.</p><p>We used 19 filters, originally defined by Weiskopf et al [<xref ref-type="bibr" rid="ref17">17</xref>] as a metric for evaluating the completeness of EHRs, to build patient cohorts within each dataset. These filters helped identify the types of data available after their application. To maintain consistency across datasets, we applied these filters to the patient populations with EHR data in each dataset. 
Detailed descriptions of each filter, along with their categories, are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Filters used in our analysis, grouped by category and descriptions of each filter.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="2">Group and filter</td><td align="left" valign="top">Description</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3"><bold>Demographics</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Alive</td><td align="left" valign="top">Patient is alive at the time of the query</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AgeSex</td><td align="left" valign="top">Patient has both sex and age recorded</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2265;18</td><td align="left" valign="top">Patient has a diagnosis at an age included in the filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2264;21</td><td align="left" valign="top">Patient has a diagnosis at an age included in the filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2264;40</td><td align="left" valign="top">Patient has a diagnosis at an age included in the filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2264;65</td><td align="left" valign="top">Patient has a diagnosis at an age included in the filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2265;65</td><td align="left" valign="top">Patient has a diagnosis at an age included in the filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Age filter &#x2264;80</td><td align="left" valign="top">Patient has a diagnosis at an age included in the 
filter</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Address or zip code</td><td align="left" valign="top">Patient has an address or zip code recorded</td></tr><tr><td align="left" valign="top" colspan="3"><bold>Medical interactions</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Patient has at least 1 diagnosis recorded</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Medication</td><td align="left" valign="top">Patient has at least 1 medication prescribed and recorded</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Outpatient visit</td><td align="left" valign="top">Patient has at least 1 outpatient visit recorded</td></tr><tr><td align="left" valign="top" colspan="3"><bold>Observation period</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 week</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 weeks</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 month</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">6 months</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 year</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 years</td><td align="left" valign="top">Patient has a recorded observation period 
equal or longer than the filter span</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">6 years</td><td align="left" valign="top">Patient has a recorded observation period equal or longer than the filter span</td></tr></tbody></table></table-wrap><p>First, we queried our databases to get a count of all the patients, grouping them by self-reported race and ethnicity. After establishing the initial groups, we applied each filter one at a time to see the effect of that filter on sample availability.</p><p>This study aimed to identify the biases of different types of filters that are used by researchers to evaluate data completeness in EHR datasets. Our focus is on biases across races and ethnicities that may be introduced by applying these filters.</p><p>We then assessed the statistical significance of the filters&#x2019; impact on different racial and ethnic subgroups through binomial testing, comparing the expected sample to the observed filtered sample. For each group, we calculated the <italic>P</italic> values by comparing the observed proportion relative to the sum of that group and the white baseline group against an expected baseline.</p><disp-formula id="E1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">O</mml:mi><mml:mi mathvariant="normal">b</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi 
mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">p</mml:mi></mml:mrow><mml:mspace width="thinmathspace"/><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">p</mml:mi></mml:mrow><mml:mspace width="thinmathspace"/><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">W</mml:mi><mml:mi mathvariant="normal">h</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mspace 
width="thinmathspace"/><mml:mi>n</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">E</mml:mi><mml:mi mathvariant="normal">x</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mspace width="thinmathspace"/><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mspace 
width="thinmathspace"/><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">W</mml:mi><mml:mi mathvariant="normal">h</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mspace width="thinmathspace"/><mml:mi>n</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Subsequently, we adjusted the <italic>P</italic> values for multiple hypothesis correction using the Bonferroni method.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>The research performed complies with all relevant ethical regulations; the institutional review boards (IRBs) that approved the study protocol are Columbia (IRB AAAL0601) and CSMC (IRB STUDY00003395). Patients were enrolled under a waiver of consent in CSMC and CUIMC. Consent in AoU and UKBB is managed by each platform, respectively. During the study, we only accessed nonprotected health information and total counts, maintaining the confidentiality of every patient.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>We applied the filters to each dataset separately to assess their individual effects. 
<xref ref-type="table" rid="table3">Table 3</xref> indicates the percentage of patients remaining after applying each filter for each dataset.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Percentage of population remaining on each dataset after applying the different filters.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Filter</td><td align="left" valign="bottom">Cedars-Sinai, n (%)</td><td align="left" valign="bottom">Columbia, n (%)</td><td align="left" valign="bottom">All of Us, n (%)</td><td align="left" valign="bottom">UKBB<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Alive</td><td align="left" valign="top">3,852,435 (95.56)</td><td align="left" valign="top">6,774,829 (95.12)</td><td align="left" valign="top">283,806 (98.88)</td><td align="left" valign="top">467,333 (93.02)</td></tr><tr><td align="left" valign="top">AgeSex</td><td align="left" valign="top">4,026,072 (99.87)</td><td align="left" valign="top">6,002,422 (84.28)</td><td align="left" valign="top">28,701 (100)</td><td align="left" valign="top">502,364 (100)</td></tr><tr><td align="left" valign="top">Age filter &#x2265;18</td><td align="left" valign="top">1,624,951 (40.30)</td><td align="left" valign="top">4,286,839 (60.19)</td><td align="left" valign="top">253,948 (88.48)</td><td align="left" valign="top">229,960 (45.77)</td></tr><tr><td align="left" valign="top">Age filter &#x2264;21</td><td align="left" valign="top">127,771 (3.16)</td><td align="left" valign="top">1,328,385 (18.65)</td><td align="left" valign="top">21,881 (7.62)</td><td align="left" valign="top">82,953 (16.51)</td></tr><tr><td align="left" valign="top">Age filter &#x2264;40</td><td align="left" valign="top">733,191 (18.18)</td><td align="left" valign="top">2,814,216 (39.51)</td><td align="left" valign="top">103,431 (36.04)</td><td align="left" 
valign="top">178,503 (35.53)</td></tr><tr><td align="left" valign="top">Age filter &#x2264;65</td><td align="left" valign="top">1,348,000 (33.43)</td><td align="left" valign="top">4,393,121 (61.98)</td><td align="left" valign="top">227,072 (79.12)</td><td align="left" valign="top">225,790 (44.49)</td></tr><tr><td align="left" valign="top">Age filter &#x2265;65</td><td align="left" valign="top">424,790 (10.53)</td><td align="left" valign="top">1,170,388 (16.43)</td><td align="left" valign="top">81,938 (28.55)</td><td align="left" valign="top">122,327 (24.35)</td></tr><tr><td align="left" valign="top">Age filter &#x2264;80</td><td align="left" valign="top">1,584,518 (39.30)</td><td align="left" valign="top">4,992,410 (70.09)</td><td align="left" valign="top">252,875 (88.10)</td><td align="left" valign="top">229,961 (45.77)</td></tr><tr><td align="left" valign="top">Has address or zip</td><td align="left" valign="top">1,829,303 (45.37)</td><td align="left" valign="top">4,024,099 (56.50)</td><td align="left" valign="top">287,007 (99.99)</td><td align="left" valign="top">148,261 (29.51)</td></tr><tr><td align="left" valign="top">Has medications</td><td align="left" valign="top">1,316,735 (32.66)</td><td align="left" valign="top">2,513,715 (35.29)</td><td align="left" valign="top">239,691 (83.51)</td><td align="left" valign="top">368,599 (77.37)</td></tr><tr><td align="left" valign="top">Has diagnoses</td><td align="left" valign="top">1,663,429 (41.26)</td><td align="left" valign="top">5,158,066 (72.42)</td><td align="left" valign="top">254,449 (88.66)</td><td align="left" valign="top">466,982 (92.95)</td></tr><tr><td align="left" valign="top">Has outpatient visits</td><td align="left" valign="top">2,963,959 (73.25)</td><td align="left" valign="top">2,893,964 (40.63)</td><td align="left" valign="top">286,214 (99.72)</td><td align="left" valign="top">230,078 (45.79)</td></tr><tr><td align="left" valign="top">Observation period 1 week</td><td align="left" 
valign="top">2,242,853 (55.63)</td><td align="left" valign="top">4,993,424 (70.11)</td><td align="left" valign="top">5254 (1.83)</td><td align="left" valign="top">223,398 (44.46)</td></tr><tr><td align="left" valign="top">Observation period 2 weeks</td><td align="left" valign="top">2,190,443 (54.33)</td><td align="left" valign="top">4,937,814 (69.33)</td><td align="left" valign="top">4916 (1.71)</td><td align="left" valign="top">223,203 (44.43)</td></tr><tr><td align="left" valign="top">Observation period 1 month</td><td align="left" valign="top">2,121,948 (52.63)</td><td align="left" valign="top">4,855,922 (68.18)</td><td align="left" valign="top">4398 (1.53)</td><td align="left" valign="top">222,915 (44.37)</td></tr><tr><td align="left" valign="top">Observation period 6 months</td><td align="left" valign="top">1,889,240 (46.86)</td><td align="left" valign="top">4,240,529 (59.54)</td><td align="left" valign="top">2938 (1.02)</td><td align="left" valign="top">222,006 (44.19)</td></tr><tr><td align="left" valign="top">Observation period 1 year</td><td align="left" valign="top">1,734,906 (43.03)</td><td align="left" valign="top">4,082,843 (57.32)</td><td align="left" valign="top">2602 (1)</td><td align="left" valign="top">221,321 (44.05)</td></tr><tr><td align="left" valign="top">Observation period 2 years</td><td align="left" valign="top">1,541,480 (38.23)</td><td align="left" valign="top">3,880,143 (54.48)</td><td align="left" valign="top">2241 (1)</td><td align="left" valign="top">220,300 (43.85)</td></tr><tr><td align="left" valign="top">Observation period 6 years</td><td align="left" valign="top">733,203 (18.18)</td><td align="left" valign="top">3,151,463 (44.25)</td><td align="left" valign="top">2007 (1)</td><td align="left" valign="top">217,257 (43.24)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>UKBB: UK Biobank.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Cedars-Sinai</title><p><xref ref-type="fig" 
rid="figure1">Figures 1A</xref> and <xref ref-type="fig" rid="figure2">2A</xref> show the percentage of available patients in the CSMC after applying each filter. The results show that both unknown race and unknown ethnicity are the most affected groups when applying the filters. This causes the values for the group all to decrease too, and it is shown across every cohort.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Available percentage of patients&#x2019; data upon individually applying all 19 filters in different ethnic subgroups in (A) the Cedars-Sinai dataset, (B) the CUIMC dataset, and (C) the AoU dataset. The filters are in descending order following the available percentage of the category, all. The points are connected to ease the visualization, but the filters are not cumulative. Stacked bar plots show the ethnicity distribution of the datasets in percentages. Stacked bar plot from CSMC has 8.55% (n=344,708) of Hispanic or Latino, 41.97% (n=1,691,775) of non-Hispanic or non-Latino, and 49.48% (n=1,994,824) of unknown ethnicity. Stacked bar plot from CUIMC has 9.23% (n=657,288) of Hispanic or Latino, 18.23% (n=1,298,181) of non-Hispanic or non-Latino, and 72.54% (n=5,166,379) of unknown ethnicity. Stacked bar plot from AoU has 18.83% (n=54,054) of Hispanic or Latino, 77.32% (n=221,935) of non-Hispanic or non-Latino, and 3.84% (n=11,023) of unknown ethnicity. The available percentage values can be found in Tables S2-S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
AoU: All of Us; CSMC: Cedars-Sinai Medical Center; CUIMC: Columbia University Irving Medical Center.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e67591_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Available percentage of patients&#x2019; data upon individually applying all 19 filters in different racial subgroups in (A) the Cedars-Sinai dataset, (B) the CUIMC dataset, and (C) the AoU dataset. The filters are in descending order following the available percentage of the category, all. The points are connected to ease the visualization, but the filters are not cumulative. Stacked bar plots show the race distribution of the datasets in percentages. Stacked bar plot from CSMC has 0.14% (n=5695) of American Indian and/or Alaska Native, 5.11% (n=205,978) of Asian, 9.26% (n=373,130) of Black or African, 0.18% (n=7082) of Hawaiian Native or Pacific Islander, 49.42% (n=1,992,336) of white, and 35.89% (n=1,447,086) of unknown race. Stacked bar plot from CUIMC has 0.12% (n=8275) of American Indian and/or Alaska Native, 1.40% (n=100,046) of Asian, 6.26% (n=445,623) of Black or African, 0.09% (n=6693) of Hawaiian Native or Pacific Islander, 17.58% (n=1,252,219) of white, 14.56% (n=1,037,027) of another race, and 59.99% (n=4,271,965) of unknown race. Stacked bar plot from AoU has 2.89% (n=8294) of Asian, 20.30% (n=58,264) of Black or African, 0.12% (n=344) of Hawaiian Native or Pacific Islander, 53.89% (n=154,678) of white, 1.74% (n=5000) of mixed race, 1.65% (n=4727) of another race, and 19.40% (n=55,705) of unknown race. The available percentage values can be found in Tables S2-S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
AoU: All of Us; CSMC: Cedars-Sinai Medical Center; CUIMC: Columbia University Irving Medical Center.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e67591_fig02.png"/></fig><p>The results of the CSMC cohort show that every known race or ethnicity group is above &#x201C;all&#x201D; in almost every filter. However, both unknown race and unknown ethnicity are the most affected groups when applying the filters. This causes the values for the group all to decrease too, and it is shown across every cohort.</p><p>Nevertheless, it is important to note that in this dataset, the Black or African American population is the group most affected by the filters, being significantly more affected than the white population in 16 of 19 filters, as seen in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Additionally, as the observation period filter increases, every race group becomes more affected than the white population.</p></sec><sec id="s3-3"><title>All of Us</title><p>For the AoU dataset, we applied the filters to the cohort of patients in the controlled tier 7 who had EHR records. This process reduced the number of patients from 410,235 to 287,012. Upon applying age or sex, medication, zip code or address (in this dataset, we have state of residence, so we used that instead of zip code), alive status, and outpatient visits, the initial cohort remained largely unchanged. However, as more stringent age filters were applied and the observational period was extended, the cohort population significantly decreased. Among all the races, the Asian group was most noticeably impacted, particularly when the observation period filter was applied, as shown in <xref ref-type="fig" rid="figure2">Figure 2C</xref>. Within this dataset, unlike at CSMC, the majority of racial and ethnic groups follow the same pattern when each filter is applied. 
It is remarkable that most of the groups experience significant data loss compared to the white group on the same filters, as shown in Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-4"><title>Columbia University Irving Medical Center</title><p>Similarly to the cohort from CSMC, the unknown race or ethnicity and other values decrease the most when applying the filters, bringing down the overall percentage. The known races or ethnicities are again above the all group&#x2019;s percentage in almost every category. It is important to note that unknown race and ethnicity represent close to 60% (n=4,271,965) and 72% (n=5,166,379) of Columbia University Irving Medical Center&#x2019;s cohort, respectively, contributing to the low baseline percentage for the all group.</p><p>Of the known races and ethnicities, we can see in <xref ref-type="fig" rid="figure1">Figures 1B</xref> and <xref ref-type="fig" rid="figure2">2B</xref> that the American Indian and/or Alaska Native population is the most significantly affected by the filters (18 of 19), even crossing the all line. This is followed by the Black or African American population, which takes second place in 16 of 19 filters. However, contrary to the CSMC cohort, the non-Hispanic or non-Latino ethnicity is the most affected by the filters (10 of 19 filters).</p></sec><sec id="s3-5"><title>UK Biobank</title><p>In the UKBB, race and ethnicity classifications differ from those used in American institutions. To ensure consistency in data presentation, we applied the UK government&#x2019;s recommended grouping strategy [<xref ref-type="bibr" rid="ref18">18</xref>], aligning it with the US classification system for comparability. 
We included any other Black background, African, Black or Black British, and Caribbean under the category Black or African origin; any other Asian background, Asian or Asian British, Bangladeshi, Chinese, Indian, and Pakistani under the category Asian; any other white background, British, Irish, and white under the category white; do not know and prefer not to answer under unknown; any other mixed background, mixed, white and Asian, white and Black African, and white and Black Caribbean under mixed; and finally, other ethnic group under other.</p><p>After this grouping, there are some aspects to remark on from this dataset: the white population represents close to 94% (n=473,353) of the group, which completely biases the all results. Having that in mind, we can see in <xref ref-type="fig" rid="figure3">Figure 3</xref> how every other race was impacted more than the baseline, especially the Black or African origin group.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Available percentage of patients&#x2019; data upon individually applying all 19 filters in different racial subgroups (<bold>A</bold>) in the UKBB dataset. The filters are in descending order following the available percentage of the category, all. The points are connected to ease the visualization, but the filters are not cumulative. Stacked bar plots show the race distribution of the datasets in percentages. (B) Stacked bar plot shows the racial distribution of the dataset in percentages, showing 2.28% (n=11,472) of Asian, 0.71% (n=3552) of Black or African, 94.22% (n=473,353) of white, 0.34% (n=1731) of mixed race, 2.07% (n=10,394) of another race, and 0.38% (n=1931) of unknown race. The available percentage values can be found in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
UKBB: UK Biobank.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e67591_fig03.png"/></fig><p>We then analyzed the differences within the most prevalent groups in this dataset, evaluating only the 5 most common categories. We found that the British group accounts for 91% (n=442,973) of the total, only counting the top 5 groups. This approach yielded results similar to those of the complete one. The first 5 categories, in order of percentage, were the following: British (91.7%, n=442,973), any other white background (3.4%, n=16,455), Irish (2.8%, n=13,346), Indian (1.2%, n=5,955), and other ethnic group (1%, n=4,609). These percentages account for the addition of the population of the top 5 groups and not the total.</p><p>A low adjusted <italic>P</italic> value (eg, &#x003C;.001) suggests that the subgroup is significantly more affected by the specific filter than the white group used as a baseline. We see across every dataset that most of the groups are significantly more affected by the filter than our baseline, but it is particularly notable in groups like Black or African American and American Indian and/or Alaska Native. The Hispanic or Latino ethnic group also shows more significant data loss than non-Hispanic or non-Latino across every dataset. The <italic>P</italic> values can be found in Tables S6-S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>Our study investigates the potential racial and ethnic biases introduced by applying common data quality and completeness filters in clinical research databases, including AoU, UKBB, and 2 academic medical centers. 
We analyzed 19 filters across approximately 12 million patients and discovered that certain filters significantly reduce data availability and have a differential effect on racial and ethnic groups.</p><p>The challenge with these filters lies in distinguishing between patients with missing data who could be relatively healthy, have not recently sought medical care, or have limited access to health care systems. Each group will have a low number of data entries in their records. Consequently, these filters might bias the resulting cohort by selecting sicker patients who interact with the health care system more frequently and/or those who have more access to health care systems. For example, in a cohort of 10,000 patients, those with poorer health status had more laboratory tests and medication orders, resulting in more comprehensive data in their records [<xref ref-type="bibr" rid="ref19">19</xref>]. On the other hand, minoritized populations usually have less access to health care [<xref ref-type="bibr" rid="ref20">20</xref>], which affects the data&#x2019;s completeness and reduces their data points when we apply different types of filters. We focused on bringing attention to the second point.</p></sec><sec id="s4-2"><title>Principal Findings</title><p>Throughout the analysis of the 4 different cohorts, a consistent pattern emerged: applied filters disproportionately affected minoritized groups, particularly the Black or African American group, which consistently has one of the lowest data availabilities across all datasets, and the American Indian and/or Alaska Native group. These filters significantly reduced the already limited data points for minoritized groups, further diminishing the completeness and usability of their data compared to white or non-Latino patients. 
We observe a similar pattern in the Hispanic or Latino group, where data availability is consistently lower in every cohort compared to the non-Hispanic or non-Latino group.</p><p>In the self-reported race groups, we observe that almost every group has less data availability than the white group, which is the largest within the known self-reported races across all datasets except at CSMC. At CSMC, the most complete group varies by filter, alternating between Asian, Native Hawaiian or Pacific Islander, American Indian and/or Alaska Native, and white. In contrast, in the CUIMC dataset, the American Indian and/or Alaska Native group has the lowest data availability.</p><p>Among the 4 distinct datasets, only the AoU dataset closely reflects the diversity of the US population, with approximately 50% (n=132,334) of the data representing populations other than white. Upon applying different filters on this dataset, as shown in <xref ref-type="fig" rid="figure1">Figures 1C</xref> and <xref ref-type="fig" rid="figure2">2C</xref>, most groups follow the same original pattern prior to applying the filters and deviate from the baseline to a lesser extent, demonstrating that it is possible to achieve a diverse and complete dataset.</p><p>Dataset diversity is essential for enhancing the generalizability and inclusivity of clinical research, addressing disparities, and improving health care outcomes for underrepresented populations. The AoU dataset, designed as a nationwide research program, aims to collect health data from a diverse population and succeeds at creating an equitable framework for research where most of the groups share the same data availability percentage. In contrast, the CUIMC and CSMC datasets reflect the specific patient populations of their respective regions, leading to localized diversity compared to the national scope of the AoU dataset. 
However, both are based in highly racially and ethnically diverse US cities, giving them a unique advantage over other institutional-level datasets. The information for the different populations and distributions of the locations for the 4 datasets can be found in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Both in the United States and the United Kingdom, the white population constitutes the majority. Minorities, as defined by the US Office of Management and Budget, include racial and ethnic groups such as American Indian, Alaska Native, Asian, Black or African American, and Native Hawaiian or Pacific Islander. These groups often face health disparities, which can result in reduced access to health care and underrepresentation in research cohorts [<xref ref-type="bibr" rid="ref21">21</xref>]. This underrepresentation may lead to inaccurate clinical care decisions, skewed genetic associations, and suboptimal treatment strategies.</p></sec><sec id="s4-3"><title>Limitations</title><p>Our findings highlight the importance of carefully selecting filters to ensure equitable research outcomes, particularly for minority populations. While we do not claim these are the most frequently used filters by researchers, nor the optimal ones for selecting patients with complete data, it is essential to investigate any potential biases that may be introduced upon applying each filter before conducting research on these populations.</p><p>Additionally, methods to mitigate bias must be used when possible. One example is artificial intelligence&#x2013;driven synthetic data generation for bias mitigation, which can be done using different methodologies such as generative adversarial networks, synthetic minority oversampling, or Bayesian networks [<xref ref-type="bibr" rid="ref22">22</xref>]. Other techniques include reweighting, suppression, or multiple imputation [<xref ref-type="bibr" rid="ref23">23</xref>]. 
Advanced statistical techniques like inverse probability weighting can also help address these challenges and enhance dataset diversity [<xref ref-type="bibr" rid="ref24">24</xref>].</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future work should focus on understanding how the application of these filters affects the results of common downstream analyses, such as disease risk prediction tasks and genome-wide association studies, and how to improve existing techniques for bias mitigation. We also recommend that researchers begin including sample sizes for relevant racial and ethnic groups both before and after any used filters are applied so that readers can better contextualize the results of the study.</p><p>Addressing disparities in representation is critical to creating research cohorts that accurately reflect the target population. This work underscores the challenges of achieving data completeness and proper representation of racial and ethnic populations and other minoritized groups in clinical research. Strategies to mitigate these disparities, along with careful consideration of filters, are crucial for ensuring equitable research outcomes and enhancing the inclusivity of health datasets.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Our findings underscore the importance of using only necessary filters, as they may affect the diversity and completeness of sample data, which particularly affects underrepresented populations. Upon applying different filters to the 4 distinct datasets, we observed that only the AoU dataset maintained the original sample distribution along racial and ethnic groupings, with minimal deviation from the baseline, demonstrating the potential to achieve a diverse and complete dataset.</p><p>Researchers must consider their target population when conducting studies and proactively address unintentional biases that may arise in data-driven research as well as the impact of these biases on downstream analyses. 
While sample filters are often necessary, we recommend that researchers implement techniques to mitigate biases and provide sample size information across racial and ethnic groupings both before and after the filters are applied, so readers can better understand the generalizability of the study.</p><p>Future work should characterize how the application of these filters affects downstream analyses and improve on existing techniques to minimize their impact. We strive to achieve a state where the datasets accurately represent the target population of the studies and where research studies are performed on the same population that institutions serve.</p></sec></sec></body><back><ack><p>NAF was supported by NIH T32 HL116273.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AoU</term><def><p>All of Us</p></def></def-item><def-item><term id="abb2">CSMC</term><def><p>Cedars-Sinai Medical Center</p></def></def-item><def-item><term id="abb3">CUIMC</term><def><p>Columbia University Irving Medical Center</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5">IRB</term><def><p>institutional review board</p></def></def-item><def-item><term id="abb6">UKBB</term><def><p>UK Biobank</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Toh</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Brody</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Yen Kheng</surname><given-names>T</given-names> </name></person-group><article-title>Applications of machine learning in healthcare</article-title><source>Smart 
Manufacturing&#x2014;When Artificial Intelligence Meets the Internet of Things</source><year>2021</year><publisher-name>IntechOpen</publisher-name><pub-id pub-id-type="doi">10.5772/intechopen.92297</pub-id><pub-id pub-id-type="other">978-1-83881-087-0</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Javaid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Haleem</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pratap Singh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Suman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rab</surname><given-names>S</given-names> </name></person-group><article-title>Significance of machine learning in healthcare: features, pillars and applications</article-title><source>International Journal of Intelligent Networks</source><year>2022</year><volume>3</volume><fpage>58</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/j.ijin.2022.05.002</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Togioka</surname><given-names>B</given-names> </name><name name-style="western"><surname>Duvivier</surname><given-names>D</given-names> </name><name name-style="western"><surname>Young</surname><given-names>E</given-names> </name></person-group><source>Diversity and Discrimination in Health Care</source><year>2024</year><publisher-name>StatPearls</publisher-name></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>National Academies of Sciences, Engineering, and Medicine</collab><collab>Policy and Global Affairs</collab><collab>Committee on Women in Science, Engineering, and 
Medicine</collab></person-group><article-title>Why diverse representation in clinical research matters and the current state of representation within the clinical research ecosystem</article-title><source>Improving Representation in Clinical Trials and Research: Building Research Equity for Women and Underrepresented Groups</source><year>2022</year><publisher-name>National Academies Press (US)</publisher-name></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Palaniappan</surname><given-names>L</given-names> </name></person-group><article-title>Improving diversity in medical research</article-title><source>Nat Rev Dis Primers</source><year>2021</year><month>10</month><day>14</day><volume>7</volume><issue>1</issue><fpage>74</fpage><pub-id pub-id-type="doi">10.1038/s41572-021-00316-8</pub-id><pub-id pub-id-type="medline">34650078</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hammond</surname><given-names>MEH</given-names> </name><name name-style="western"><surname>Stehlik</surname><given-names>J</given-names> </name><name name-style="western"><surname>Drakos</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Kfoury</surname><given-names>AG</given-names> </name></person-group><article-title>Bias in medicine</article-title><source>JACC: Basic to Translational Science</source><year>2021</year><month>01</month><volume>6</volume><issue>1</issue><fpage>78</fpage><lpage>85</lpage><pub-id pub-id-type="doi">10.1016/j.jacbts.2020.07.012</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Adan</surname><given-names>C</given-names> </name></person-group><article-title>The importance of diversity in clinical research</article-title><source>Br J Nurs</source><year>2023</year><month>10</month><day>12</day><volume>32</volume><issue>18</issue><fpage>898</fpage><lpage>901</lpage><pub-id pub-id-type="doi">10.12968/bjon.2023.32.18.898</pub-id><pub-id pub-id-type="medline">37830855</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landry</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>N</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Rehm</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Bonham</surname><given-names>VL</given-names> </name></person-group><article-title>Lack of diversity in genomic databases is a barrier to translating precision medicine research into practice</article-title><source>Health Aff (Millwood)</source><year>2018</year><month>05</month><volume>37</volume><issue>5</issue><fpage>780</fpage><lpage>785</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.2017.1595</pub-id><pub-id pub-id-type="medline">29733732</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramamoorthy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pacanowski</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Bull</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name></person-group><article-title>Racial/ethnic differences in drug disposition and response: review of 
recently approved drugs</article-title><source>Clin Pharmacol Ther</source><year>2015</year><month>03</month><volume>97</volume><issue>3</issue><fpage>263</fpage><lpage>273</lpage><pub-id pub-id-type="doi">10.1002/cpt.61</pub-id><pub-id pub-id-type="medline">25669658</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>LT</given-names> </name><name name-style="western"><surname>Watkins</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pi&#x00F1;a</surname><given-names>IL</given-names> </name><etal/></person-group><article-title>Increasing diversity in clinical trials: overcoming critical barriers</article-title><source>Curr Probl Cardiol</source><year>2019</year><month>05</month><volume>44</volume><issue>5</issue><fpage>148</fpage><lpage>172</lpage><pub-id pub-id-type="doi">10.1016/j.cpcardiol.2018.11.002</pub-id><pub-id pub-id-type="medline">30545650</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weber</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>WG</given-names> </name><name name-style="western"><surname>Bernstam</surname><given-names>EV</given-names> </name><etal/></person-group><article-title>Biases introduced by filtering electronic health records for patients with &#x201C;complete data&#x201D;</article-title><source>J Am Med Inform Assoc</source><year>2017</year><month>11</month><day>1</day><volume>24</volume><issue>6</issue><fpage>1134</fpage><lpage>1141</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx071</pub-id><pub-id pub-id-type="medline">29016972</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Heintzman</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Hoopes</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>Agreement of Medicaid claims and electronic health records for assessing preventive care quality among adults</article-title><source>J Am Med Inform Assoc</source><year>2014</year><month>07</month><day>1</day><volume>21</volume><issue>4</issue><fpage>720</fpage><lpage>724</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-002333</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Devoe</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Gold</surname><given-names>R</given-names> </name><name name-style="western"><surname>McIntire</surname><given-names>P</given-names> </name><name name-style="western"><surname>Puro</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chauvie</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gallia</surname><given-names>CA</given-names> </name></person-group><article-title>Electronic health records vs Medicaid claims: completeness of diabetes preventive care data in community health centers</article-title><source>Ann Fam Med</source><year>2011</year><volume>9</volume><issue>4</issue><fpage>351</fpage><lpage>358</lpage><pub-id pub-id-type="doi">10.1370/afm.1279</pub-id><pub-id pub-id-type="medline">21747107</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hersh</surname><given-names>WR</given-names> </name><name 
name-style="western"><surname>Weiner</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Embi</surname><given-names>PJ</given-names> </name><etal/></person-group><article-title>Caveats for the use of operational electronic health record data in comparative effectiveness research</article-title><source>Med Care</source><year>2013</year><month>08</month><volume>51</volume><issue>8 Suppl 3</issue><fpage>S30</fpage><lpage>S37</lpage><pub-id pub-id-type="doi">10.1097/MLR.0b013e31829b1dbd</pub-id><pub-id pub-id-type="medline">23774517</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gopal</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Chetty</surname><given-names>U</given-names> </name><name name-style="western"><surname>O&#x2019;Donnell</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gajria</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blackadder-Weinstein</surname><given-names>J</given-names> </name></person-group><article-title>Implicit bias in healthcare: clinical practice, research and decision making</article-title><source>Future Healthc J</source><year>2021</year><month>03</month><volume>8</volume><issue>1</issue><fpage>40</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.7861/fhj.2020-0233</pub-id><pub-id pub-id-type="medline">33791459</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Data snapshots (public data)</article-title><source>All of Us Research Program</source><access-date>2025-02-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.researchallofus.org/data-tools/data%20snapshots/">https://www.researchallofus.org/data-tools/data%20snapshots/</ext-link></comment></nlm-citation></ref><ref 
id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Swaminathan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Defining and measuring completeness of electronic health records for secondary use</article-title><source>J Biomed Inform</source><year>2013</year><month>10</month><volume>46</volume><issue>5</issue><fpage>830</fpage><lpage>836</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2013.06.010</pub-id><pub-id pub-id-type="medline">23820016</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>List of ethnic groups</article-title><source>National Health System of UK</source><access-date>2025-02-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ethnicity-facts-figures.service.gov.uk/style-guide/ethnic-groups/">https://www.ethnicity-facts-figures.service.gov.uk/style-guide/ethnic-groups/</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rusanov</surname><given-names>A</given-names> </name><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Hidden in plain sight: bias towards sick patients when sampling patients with sufficient electronic health record data for research</article-title><source>BMC Med Inform Decis 
Mak</source><year>2014</year><month>06</month><day>11</day><volume>14</volume><issue>1</issue><fpage>51</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-14-51</pub-id><pub-id pub-id-type="medline">24916006</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>National Academies of Sciences, Engineering, and Medicine</collab><collab>Health and Medicine Division</collab><collab>Board on Population Health and Public Health Practice</collab><collab>Committee on Community-Based Solutions to Promote Health Equity in the United States</collab></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Baciu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Geller</surname><given-names>A</given-names> </name></person-group><article-title>The state of health disparities in the United States</article-title><source>Communities in Action: Pathways to Health Equity</source><year>2017</year><publisher-name>National Academies Press (US)</publisher-name></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Minority health and health disparities definitions</article-title><source>National Institute on Minority Health and Health Disparities</source><access-date>2025-02-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nimhd.nih.gov/resources/understanding-health-disparities/minority-health-and-health-disparities-definitions.html">https://www.nimhd.nih.gov/resources/understanding-health-disparities/minority-health-and-health-disparities-definitions.html</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahul Hameed</surname><given-names>MA</given-names> </name><name 
name-style="western"><surname>Qureshi</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Kaushik</surname><given-names>A</given-names> </name></person-group><article-title>Bias mitigation via synthetic data generation: a review</article-title><source>Electronics (Basel)</source><year>2024</year><month>10</month><day>2</day><volume>13</volume><issue>19</issue><fpage>3909</fpage><pub-id pub-id-type="doi">10.3390/electronics13193909</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nazer</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Zatarah</surname><given-names>R</given-names> </name><name name-style="western"><surname>Waldrip</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Bias in artificial intelligence algorithms and recommendations for mitigation</article-title><source>PLOS Digit Health</source><year>2023</year><month>06</month><volume>2</volume><issue>6</issue><fpage>e0000278</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000278</pub-id><pub-id pub-id-type="medline">37347721</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seaman</surname><given-names>SR</given-names> </name><name name-style="western"><surname>White</surname><given-names>IR</given-names> </name></person-group><article-title>Review of inverse probability weighting for dealing with missing data</article-title><source>Stat Methods Med Res</source><year>2013</year><month>06</month><volume>22</volume><issue>3</issue><fpage>278</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1177/0962280210395740</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional 
information and detailed results.</p><media xlink:href="medinform_v13i1e67591_app1.docx" xlink:title="DOCX File, 193 KB"/></supplementary-material></app-group></back></article>