<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e72068</article-id><article-id pub-id-type="doi">10.2196/72068</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Unsupervised Coverage Sampling to Enhance Clinical Chart Review Coverage for Computable Phenotype Development: Simulation and Empirical Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Zigui</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hurst</surname><given-names>Jillian H</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hong</surname><given-names>Chuan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Goldstein</surname><given-names>Benjamin 
Alan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biostatistics and Bioinformatics, Duke University School of Medicine, Duke University</institution><addr-line>2424 Erwin Road, 9023 Hock Plaza</addr-line><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Pediatrics, Duke University School of Medicine, Duke University</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chang</surname><given-names>Feier</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Borza</surname><given-names>Victor</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Benjamin Alan Goldstein, PhD, Department of Biostatistics and Bioinformatics, Duke University School of Medicine, Duke University, 2424 Erwin Road, 9023 Hock Plaza, Durham, NC, 27705, United States, +1 919-691-5011; <email>ben.goldstein@duke.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e72068</elocation-id><history><date date-type="received"><day>03</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>09</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>09</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Zigui Wang, Jillian H Hurst, Chuan Hong, Benjamin Alan Goldstein. 
Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 27.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e72068"/><abstract><sec><title>Background</title><p>Developing computable phenotypes (CP) based on electronic health records (EHR) data requires &#x201C;gold-standard&#x201D; labels for the outcome of interest. To generate these labels, clinicians typically chart-review a subset of patient charts. Charts to be reviewed are most often randomly sampled from the larger set of patients of interest. However, random sampling may fail to capture the diversity of the patient population, particularly if smaller subpopulations exist among those with the condition of interest. 
This can lead to poorly performing and biased CPs.</p></sec><sec><title>Objective</title><p>This study aimed to propose an unsupervised sampling approach designed to better capture a diverse patient cohort and improve the information coverage of chart review samples.</p></sec><sec sec-type="methods"><title>Methods</title><p>Our coverage sampling method starts by clustering the patient population of interest. We then perform a stratified sampling from each cluster to ensure even representation for the chart review sample. We introduce a novel metric, nearest neighbor distance, to evaluate the coverage of the generated sample. To evaluate our method, we first conducted a simulation study to model and compare the performance of random versus our proposed coverage sampling. We varied the size and number of subpopulations within the larger cohort. Finally, we apply our approach to a real-world data set to develop a CP for hospitalization due to COVID-19. We evaluate the different sampling strategies based on the information coverage as well as the performance of the learned CP, using the area under the receiver operating characteristic curve.</p></sec><sec sec-type="results"><title>Results</title><p>Our simulation studies show that the unsupervised coverage sampling approach provides broader coverage of patient populations compared to random sampling. When there are no underlying subpopulations, both random and coverage sampling perform equally well for CP development. When there are subgroups, coverage sampling achieves area under the receiver operating characteristic curve gains of approximately 0.03&#x2010;0.05 over random sampling. 
In the real-world application, the approach also outperformed random sampling, generating both a more representative sample and an area under the receiver operating characteristic curve improvement of 0.02 (95% CI &#x2212;0.08 to 0.04).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The proposed coverage sampling method is an easy-to-implement approach that produces a chart review sample that is more representative of the source population. This allows one to learn a CP that has better performance both for subpopulations and the overall cohort. Studies that aim to develop CPs should consider alternative strategies other than randomly sampling patient charts.</p></sec></abstract><kwd-group><kwd>electronic health records</kwd><kwd>EHR</kwd><kwd>chart review sampling</kwd><kwd>coverage metric</kwd><kwd>computable phenotypes</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Electronic health records (EHR) data are widely used in clinical research. While they contain dense, often granular information on a patient&#x2019;s health status, they also pose challenges for clinical studies since they lack explicit documentation for the reason for the health care encounter (eg, admission due to infection). In principle, the problem list, which provides a historical listing of previous health problems, can be used to identify chronic conditions, though it is often unreliable [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Similarly, fields such as discharge diagnosis may not accurately represent the reason a patient had a visit. Instead, information from diagnosis codes, laboratory test results, and prescriptions or administered medications is used to indicate the presence of a specific clinical condition [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. 
This is a well-known challenge in working with EHR data and has led to the growth of computable phenotypes (CPs). CPs are algorithms, typically Boolean, though sometimes probabilistic, that use multiple sources of clinical data&#x2014;such as diagnoses, laboratory results, and medication records&#x2014;to infer the clinical condition of a patient or the reason for a visit [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Creating CPs is a multiphase process that often requires significant collaborative effort from clinicians and informaticians [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. One of the key components in CP development is the creation of a set of &#x201C;gold standard&#x201D; outcome labels. The outcome labels are typically generated based on manual review of a subset of eligible patient charts, which can require significant time [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. The set of charts that are used to develop these gold-standard labels is usually sampled randomly [<xref ref-type="bibr" rid="ref11">11</xref>]. While random sampling will, on average, produce a representative view of the population of interest, since one usually wants to review only a small number of charts, random sampling may not adequately represent the complete range of disease presentations or patient demographics. In the scenario shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>, subgroups that have a rarer presentation within the larger data set (eg, rarer presentations of the disease of interest and disease presentation in minority subgroups) are less likely to be adequately covered based on random sampling. In this case, much larger sample sets are necessary to find a meaningful number of charts from people from these subgroups [<xref ref-type="bibr" rid="ref12">12</xref>]. 
In such scenarios, random sampling strategies might not be effective in generating a sample covering all subgroups for chart review purposes and result in a CP that does not accurately capture the heterogeneity of the condition of interest. This can lead to a CP that performs worse for those subpopulations.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Example of the impact of random sampling on the representation of patient subgroups. The black dots represent unsampled patients; stars represent the sampled patients. The red box represents a subgroup that was missed by random sampling.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e72068_fig01.png"/></fig><p>In recent years, various methods have been developed to enhance the coverage of labeled data [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. However, these methods either rely on external population resources or focus solely on maximizing demographic coverage, which do not fully align with the chart review objective. Among these methods, active learning is one such approach that iteratively selects the most informative samples for labeling, aiming to optimize model performance with a minimal amount of data [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. However, active learning typically requires an initial set of labeled data to train the model and guide the selection process [<xref ref-type="bibr" rid="ref19">19</xref>]. 
Moreover, active learning methods are typically focused on identifying the samples that will provide the most leverage on the final model, as opposed to the ones that would best capture the diversity of the patient cohort [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>In this paper, we propose a process for selecting medical charts for review to generate gold standard labeling when constructing CPs. The goal of this method is to ensure that our selection captures the diversity of the full patient cohort. To achieve this, we propose a clustering-based process to generate potential samples. We then introduce a novel metric to identify the most representative sample that should be used for label generation. By enhancing the information coverage of the training sample, our approach is expected to yield a better performing CP for both subgroups and the full patient cohort. To illustrate this approach, we use simulation methods coupled with a real-world data example to demonstrate how this novel sampling approach can match or even surpass the performance of random sampling.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Sampling Approach</title><p>The overall methodological approach is illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>. We start by considering a study cohort for whom we want to create labels for the presence or absence of a condition of interest (eg, diabetes, cause-specific admission). We presume that our study cohort is large enough that we do not want to review and label all patient charts. Instead, we want to generate an optimal chart review sample, from which we will develop or &#x201C;learn&#x201D; a CP. In this paper, our analytic task is to determine how to best identify that sample. 
We propose that the best sample is one that maximizes coverage of the cohort, providing information about all of the subgroups that compose the cohort (<xref ref-type="fig" rid="figure2">Figure 2A</xref>). In other words, the sample should be equally representative of each subgroup, rather than merely reflecting the source population distribution. To assess coverage, we define a novel metric, described below. A variety of methods can be used to generate the sample pool. In this study, we propose using a stratified sampling framework, in other words, clustering the data and then sampling from these clusters (<xref ref-type="fig" rid="figure2">Figure 2B</xref>). By identifying and then sampling from clusters, we hypothesize that we will be able to represent different patient subgroups, making the chart review sample more reflective of the entire patient cohort.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Diagram illustrating our sampling approach. (A) procedures for general coverage sampling; (B) procedures for generating the sample pool.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e72068_fig02.png"/></fig><sec id="s2-1-1"><title>Optimal Sample Generation</title><p>After defining a cohort of interest, we start by clustering the individual patient records. As illustrated in our real data example, our groupings are driven by clinical factors, so we only use clinical features (ie, not demographic factors) to conduct the clustering. We then sample records randomly from each of the clusters. For example, if we prespecify that we want to review 100 charts, and we generate 4 clusters, we would sample 25 records from each cluster. While a variety of clustering algorithms can be used, we suggest hierarchical clustering. 
The nested cluster structure provided by hierarchical clustering is reflective of our proposed interpretation that the cohort consists of patient subgroups. For comparison, we also present results from K-means clustering. Notably, with a sufficiently large number of replications, we expect different clustering methods to generate similar optimal samples, resulting in comparable representative samples.</p></sec><sec id="s2-1-2"><title>Coverage Assessment</title><p>A primary step is assessing data coverage. To do so, we propose a novel metric that measures the coverage of the sample for the full data cohort. We define the n<sup>th</sup> nearest neighbor distance as:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msup><mml:mi>n</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>For each person, <italic>i</italic>, in the study cohort of size N, we 
calculate the Euclidean distance, <italic>d</italic>, to each person from the sampled set. <inline-formula><mml:math id="ieqn1"><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the distance between the <inline-formula><mml:math id="ieqn2"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> person in the cohort, to the <inline-formula><mml:math id="ieqn3"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest sampled person. For example, the 5th nearest neighbor distance refers to the sum of the distance between every patient <italic>i</italic> and its 5th closest sampled person. After generating the sample pool, we calculate the distance for each individual in the patient cohort to the <inline-formula><mml:math id="ieqn4"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest sampled person. We choose the chart review sample with the lowest <inline-formula><mml:math id="ieqn5"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance. The intuition for the <inline-formula><mml:math id="ieqn6"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance is to ensure that for each person in the full cohort, there is someone in the chart review sample that is &#x201C;near&#x201D; or representative of them. This should result in greater coverage for underrepresented subgroups and phenotypes compared to random sampling. 
For example, for a disease that can present clinically in a variety of ways (eg, diabetes), if a rarer presentation is not represented in the chart sample, then for a person <inline-formula><mml:math id="ieqn7"><mml:mi>i</mml:mi></mml:math></inline-formula> from this minority group, the closest sampled individuals will be in other subgroups, resulting in a larger <inline-formula><mml:math id="ieqn8"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance. Moreover, sampling to minimize the <inline-formula><mml:math id="ieqn9"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance will not adversely impact the majority presentation since group members will still have representative samples.</p><p>Our coverage sampling process can be summarized as follows:</p><list list-type="order"><list-item><p>Cluster the dataset based on clinical factors across a range of k clusters.</p></list-item><list-item><p>Conduct stratified sampling with a specified sample size across the clusters multiple times to generate the sample pool.</p></list-item><list-item><p>Calculate the <inline-formula><mml:math id="ieqn10"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance for each sample set in the sample pool and identify the sample with the minimal <inline-formula><mml:math id="ieqn11"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance.</p></list-item></list><p>The primary tuning parameter is n. This can be prespecified by the user, or <italic>n</italic> can be assessed over a range of values, and taking the mean distance. 
As we show below, the approach is not very sensitive to the choice of n. Although the results presented are based on Euclidean distance, given the mixed data types of the EHR data, we also assessed Manhattan and Gower distances and found that they had minimal impact on the final results.</p></sec><sec id="s2-1-3"><title>Assumptions</title><p>The primary assumption of this procedure is that we have a broad cohort from which to sample that fully captures all individuals with the condition from which we wish to define a CP. Meaning, our identified patient cohort (ie, our denominator) has perfect sensitivity for the outcome of interest, and the analytic challenge is improving the specificity of the CP.</p></sec><sec id="s2-1-4"><title>Evaluation Criteria</title><p>As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, we assess the quality of a selected sample for chart review in 2 ways: cohort coverage and CP performance. For cohort coverage, we compare the <inline-formula><mml:math id="ieqn12"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance, with samples exhibiting smaller distances considered more representative of the study cohort. For CP performance, we train a classification model using a sample derived from either our proposed coverage sampling or random sampling methods. All the unsampled patients are regarded as the test dataset. We evaluate the efficacy of these models by comparing the area under the receiver operating characteristic curve (AUROC) using the test dataset. Samples that yield models with higher AUROC values are considered to be better.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Diagram to show the evaluation criteria of sample quality. The coverage sampling refers to the sampling procedure outlined in <xref ref-type="fig" rid="figure2">Figure 2</xref>. 
AUROC: area under the receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e72068_fig03.png"/></fig></sec></sec><sec id="s2-2"><title>Simulation Study</title><p>We conduct a simulation study to evaluate the efficacy of coverage sampling outlined above. We sample 120 patients from 250 simulated datasets with a size of 10,000 and 10 characteristic variables by both random sampling and coverage sampling. Across the datasets, we generate 4 clusters and create a sample with different proportions of each cluster: (simulation set 0: 1.0,0,0,0; simulation set 1: 0.25, 0.25, 0.25, 0.25; simulation set 2: 0.1, 0.3, 0.3, 0.3; simulation set 3: 0.1, 0.1, 0.4, 0.4 and simulation set 4: 0.1, 0.1, 0.1, 0.7). These samples can be interpreted as: no underlying cluster structure, equally distributed subgroups, 1 minority subgroup, 2 minority subgroups, and one majority group. The initial 2 simulation sets (sets 0&#x2010;1) serve as baselines, where either no underlying cluster exists or all clusters are of equal size. 
The subsequent simulations (sets 2&#x2010;4) delve into more complex scenarios, incorporating minority subgroups to assess their impact on the representation of subgroups within the samples.</p><p>To generate the clustered data, we used the R package fungible [<xref ref-type="bibr" rid="ref13">13</xref>], which uses the following model:</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mi>B</mml:mi><mml:mo>+</mml:mo><mml:mi>e</mml:mi></mml:math></disp-formula><p>Where <inline-formula><mml:math id="ieqn13"><mml:mi>X</mml:mi></mml:math></inline-formula> is the matrix of simulated observations, with each row representing an observation and each column representing a variable; <inline-formula><mml:math id="ieqn14"><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is a matrix of indicators for cluster <inline-formula><mml:math id="ieqn15"><mml:mi>j</mml:mi></mml:math></inline-formula>, identifying the membership of each observation within this cluster; <inline-formula><mml:math id="ieqn16"><mml:mi>B</mml:mi></mml:math></inline-formula> is a matrix that represents the correlation between the cluster membership and observation scores, and <inline-formula><mml:math id="ieqn17"><mml:mi>e</mml:mi></mml:math></inline-formula> represents the deviations generated from a mixture distribution.</p><p>To generate outcomes (ie, phenotypes to be derived), we apply the following model:</p><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>e</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:mi>I</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Where <inline-formula><mml:math id="ieqn18"><mml:mi>P</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>e</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfenced></mml:math></inline-formula> represents the probability of <inline-formula><mml:math 
id="ieqn19"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> encounter outcome equal to 1. <inline-formula><mml:math id="ieqn20"><mml:mi>I</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:math></inline-formula> is the indicator of whether the <inline-formula><mml:math id="ieqn21"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> encounter belongs to <inline-formula><mml:math id="ieqn22"><mml:msup><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> cluster or not. <italic>X</italic> is the design matrix where each row represents one encounter, and each column represents one explanatory variable. 
<inline-formula><mml:math id="ieqn23"><mml:msub><mml:mrow><mml:mi>&#x03B1;</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn24"><mml:msub><mml:mrow><mml:mi>&#x03B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mi> </mml:mi></mml:math></inline-formula>, and  <inline-formula><mml:math id="ieqn25"><mml:msub><mml:mrow><mml:mi>&#x03B2;</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the intercept and main effects corresponding to <inline-formula><mml:math id="ieqn26"><mml:mi>I</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:math></inline-formula> and <italic>X</italic>. To more accurately reflect real-world conditions, only half of the explanatory variables were incorporated into the generation of the outcome variables, treating the remaining variables as noise (with respect to the outcome).</p><p>For each simulated dataset, we assessed information coverage and model performance across 3 samples. First, using the procedure described in the optimal sample generation section, we averaged the 1st to 10th nearest neighbor distances to obtain a hierarchical-cluster-based sample of size 120 and a k-means-cluster-based sample of size 120. For comparison, we selected 120 random samples and 120 that were sampled from the true underlying clusters. In this manuscript, we refer to the 4 samples as hierarchical, k-means, random, and truth. All data not included in these samples were retained as test data for further analysis. 
For the model performance comparison, we used each of the 4 derived samples (hierarchical cluster coverage, k-means cluster coverage, random, truth) to fit a logistic regression model to learn a probabilistic CP. For the scenario without an underlying cluster structure, only hierarchical cluster coverage, k-means coverage, and random sampling results are presented, as there is no true cluster structure to compare against. We computed the AUROC to evaluate the model&#x2019;s performance and averaged the performance over 50 iterations.</p></sec><sec id="s2-3"><title>Real-World Data Application</title><p>Our application is motivated by our previous work to develop a CP for a hospital admission due to COVID-19. During the height of the COVID-19 pandemic, hospitals tested all patients for SARS-CoV-2. Work by us [<xref ref-type="bibr" rid="ref14">14</xref>] and others [<xref ref-type="bibr" rid="ref15">15</xref>] has indicated that up to 38% of patients who tested positive for SARS-CoV-2 upon admission were admitted for reasons other than COVID-19. Therefore, a CP for admission due to COVID-19 would need to be more complex than simply a positive SARS-CoV-2 test. Our goal then is to define a sample of patients for chart review, in aid of learning a CP for admission due to COVID-19. Since COVID-19 patients could have different presentations, we hypothesize that our coverage sampling approach would be better for learning a CP.</p><sec id="s2-3-1"><title>Data Source</title><p>We abstracted data from the Duke University Health System EHR system. Duke University Health System consists of 3 hospitals on a common, EPIC-based EHR system. 
The clinical data are organized into a research-ready datamart, based on the PCORnet Common Data Model [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s2-3-2"><title>Source Cohort</title><p>Our study cohort consisted of all patients with an inpatient admission and a positive test for SARS-CoV-2 from March 2020 to March 2023 (when routine testing stopped). This definition has perfect sensitivity, but poor specificity, for capturing admissions due to COVID-19. Following our previous work, we split this cohort into training and testing data. The testing data consisted of 441 patients admitted from January 16 to 22, 2022 and were already chart reviewed for operational purposes. Additional information regarding the testing data can be found in [<xref ref-type="bibr" rid="ref14">14</xref>]. The training data consisted of the other 7743 unlabeled patients with positive SARS-CoV-2 tests from 2020&#x2010;2023.</p></sec><sec id="s2-3-3"><title>Features Used</title><p>For coverage sampling and CP generation, we used 46 clinically relevant features such as encounter characteristics (encounter type, admitting source, and discharge disposition), diagnoses, laboratory tests conducted, and medications administered. Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides full details on features used. While we extracted demographic characteristics, we did not include these in the sampling or CP development steps.</p></sec><sec id="s2-3-4"><title>Sampling and Outcome Labeling</title><p>We generated 2 samples of 100 using coverage and random sampling from the training dataset of 7743 patients. For the coverage sampling, we used hierarchical clustering and identified the cluster structure that minimized the <italic>1st</italic> nearest neighbor distance. 
An infectious disease specialist (JHH) reviewed the chart for each sampled encounter and labeled the encounter as due to COVID-19 or related sequelae, or not.</p></sec><sec id="s2-3-5"><title>Method Evaluation</title><p>We compared the patient characteristics for the samples that were selected from each sampling approach. Then, using the criteria defined above, we evaluated each sample&#x2019;s coverage of the full cohort. Finally, we used each sample to learn a probabilistic CP based on a least absolute shrinkage and selection operator logistic regression. We evaluated each version on the independent test data.</p><p>All analyses were conducted in R version 4.3.2. The source code used in these experiments is available at GitHub [<xref ref-type="bibr" rid="ref22">22</xref>].</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study was approved and declared exempt by the Duke School of Medicine IRB, protocol Pro00109397 (9/14/2021). We used a limited analytical dataset within a secure computing environment, and patients did not receive any compensation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Evaluation of Sampling Methods Using Simulated Data</title><p><xref ref-type="fig" rid="figure4">Figure 4</xref> presents the mean 1st to 10th nearest neighbor distances for both random and coverage sampling methods across 4 distinct scenarios. <xref ref-type="fig" rid="figure4">Figure 4A</xref> illustrates a baseline scenario where all clusters are of equal size, while <xref ref-type="fig" rid="figure4">Figure 4B-D</xref> depict scenarios with 1, 2, and 3 minority subgroups, respectively. In each scenario, the coverage samples consistently exhibit smaller nearest neighbor distances compared to those from random samples. As minority subgroups are incorporated into the simulated cohort, the advantage of the coverage sample over a random sample increases. 
Notably, the distances in the hierarchical and k-means-clustered coverage samples are closely aligned with those observed in true cluster configurations, indicating similar coverage of the cohort.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>From 1st to 10th nearest neighbor distance mean across 200 simulated data samples for 4 cluster ratios. The red line represents the random sample; the blue line represents the coverage sampling based on hierarchical cluster; the orange line represents the coverage sampling based on k-means clustering; the green line represents the coverage sampling based on the true cluster. (A) All simulated data follows the baseline cluster ratio (0.25,0.25,0.25,0.25). (B) All simulated data follows cluster ratio (0.1,0.3,0.3,0.3). (C) All simulated data follows cluster ratio (0.2,0.2,0.4,0.4). (D) All simulated data follows cluster ratio (0.1,0.1,0.1,0.7).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e72068_fig04.png"/></fig><p>After generating the samples, we used the data to learn a probabilistic CP and tested its performance. <xref ref-type="table" rid="table1">Table 1</xref> presents the mean 1st to 10th nearest neighbor distance and the mean logistic model&#x2019;s AUROC between a random sample and coverage samples generated using hierarchical clustering, k-means clustering, and a true cluster structure of size 120 across 250 simulated datasets. Values highlighted in asterisk (<xref ref-type="table" rid="table1">Table 1</xref>) indicate AUROCs that are significantly higher than those of the random sample at the 0.05 significance level. The visualization of the AUROC results is shown in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Consistent with coverage results, in the baseline scenario without any minority subgroup or any underlying cluster structure, coverage samples exhibit similar AUROC compared to random samples. However, with the introduction of a minority subgroup, the coverage samples based on hierarchical, k-means, and true cluster structures all produced significantly higher AUROC values compared to random samples. Additionally, we observed that the coverage samples using hierarchical clustering and k-means clustering exhibited similar performance, suggesting that the choice of clustering method has minimal impacts on coverage sampling, provided there are sufficient repetitions.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of mean distance and area under the curve between random sample, sample based on coverage samples based on hierarchical clustering, k-means clustering, and true cluster.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cluster ratio and sample type</td><td align="left" valign="bottom">Mean of 1st-10th nearest neighbor Distance</td><td align="left" valign="bottom">Overall AUROC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (95% CI)</td><td align="left" valign="bottom">Subgroup 1 AUROC (95% CI)</td><td align="left" valign="bottom">Subgroup 2 AUROC (95% CI)</td><td align="left" valign="bottom">Subgroup 3 AUROC (95% CI)</td><td align="left" valign="bottom">Subgroup 4 AUROC (95% CI)</td></tr></thead><tbody><tr><td align="char" char="." 
valign="top" colspan="7">(1.0,0,0,0)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Random</td><td align="left" valign="top">3877.728</td><td align="left" valign="top">0.806 (0.803-0.809)</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Hierarchical</td><td align="left" valign="top">3802.404</td><td align="left" valign="top">0.805 (0.801-0.808)</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;K-means</td><td align="left" valign="top">3823.452</td><td align="left" valign="top">0.805 (0.802-0.809)</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="char" char="." 
valign="top" colspan="7">(0.25,0.25,0.25,0.25)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Random</td><td align="left" valign="top">3257.498</td><td align="left" valign="top">0.751 (0.747-0.755)</td><td align="left" valign="top">0.735 (0.732-0.738)</td><td align="left" valign="top">0.739 (0.735-0.743)</td><td align="left" valign="top">0.731 (0.728-0.735)</td><td align="left" valign="top">0.733 (0.730-0.736)</td></tr><tr><td align="left" valign="top">&#x2003;Hierarchical</td><td align="left" valign="top">3170.708</td><td align="left" valign="top">0.751 (0.747-0.756)</td><td align="left" valign="top">0.734 (0.731-0.738)</td><td align="left" valign="top">0.739 (0.735-0.742)</td><td align="left" valign="top">0.731 (0.727-0.734)</td><td align="left" valign="top">0.732 (0.729-0.735)</td></tr><tr><td align="left" valign="top">&#x2003;K-means</td><td align="left" valign="top">3163.75</td><td align="left" valign="top">0.752 (0.748-0.757)</td><td align="left" valign="top">0.735 (0.732-0.738)</td><td align="left" valign="top">0.738 (0.734-0.742)</td><td align="left" valign="top">0.731 (0.727-0.734)</td><td align="left" valign="top">0.732 (0.729-0.735)</td></tr><tr><td align="left" valign="top">&#x2003;Truth</td><td align="left" valign="top">3173.694</td><td align="left" valign="top">0.748 (0.744-0.753)</td><td align="left" valign="top">0.731 (0.727-0.735)</td><td align="left" valign="top">0.735 (0.731-0.738)</td><td align="left" valign="top">0.727 (0.724-0.731)</td><td align="left" valign="top">0.729 (0.725-0.732)</td></tr><tr><td align="char" char="." 
valign="top" colspan="7">(0.1,0.3,0.3,0.3)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Random</td><td align="left" valign="top">2293.954</td><td align="left" valign="top">0.691 (0.678-0.703)</td><td align="left" valign="top">0.706 (0.694-0.717)</td><td align="left" valign="top">0.607 (0.596-0.618)</td><td align="left" valign="top">0.609 (0.599, 0.618)</td><td align="left" valign="top">0.601 (0.593-0.610)</td></tr><tr><td align="left" valign="top">&#x2003;Hierarchical</td><td align="left" valign="top">2192.928</td><td align="left" valign="top">0.742<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.731-0.753)</td><td align="left" valign="top">0.750<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.741-0.760)</td><td align="left" valign="top">0.633<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.623-0.643)</td><td align="left" valign="top">0.638<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.630-0.647)</td><td align="left" valign="top">0.629<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.620-0.638)</td></tr><tr><td align="left" valign="top">&#x2003;K-means</td><td align="left" valign="top">2183.826</td><td align="left" valign="top">0.740<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.729-0.752)</td><td align="left" valign="top">0.746<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.737-0.755)</td><td align="left" valign="top">0.632<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.623-0.642)</td><td align="left" valign="top">0.636<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.628-0.644)</td><td align="left" valign="top">0.628<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.620-0.636)</td></tr><tr><td align="left" valign="top">&#x2003;Truth</td><td align="left" valign="top">2197.208</td><td align="left" valign="top">0.739<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> 
(0.727-0.751)</td><td align="left" valign="top">0.747<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.736-0.757)</td><td align="left" valign="top">0.631<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.621-0.642)</td><td align="left" valign="top">0.634<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.625-0.643)</td><td align="left" valign="top">0.622<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.613-0.632)</td></tr><tr><td align="char" char="." valign="top" colspan="7">(0.1,0.1,0.4,0.4)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Random</td><td align="left" valign="top">2478.755</td><td align="left" valign="top">0.747 (0.737-0.757)</td><td align="left" valign="top">0.746 (0.737-0.756)</td><td align="left" valign="top">0.743 (0.734-0.753)</td><td align="left" valign="top">0.619 (0.612-0.626)</td><td align="left" valign="top">0.617 (0.609-0.626)</td></tr><tr><td align="left" valign="top">&#x2003;Hierarchical</td><td align="left" valign="top">2286.317</td><td align="left" valign="top">0.778<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.769-0.788)</td><td align="left" valign="top">0.774<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.767-0.781)</td><td align="left" valign="top">0.769<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.762-0.776)</td><td align="left" valign="top">0.636<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.630-0.641)</td><td align="left" valign="top">0.635<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.628-0.642)</td></tr><tr><td align="left" valign="top">&#x2003;K-means</td><td align="left" valign="top">2276.429</td><td align="left" valign="top">0.775<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.764-0.786)</td><td align="left" valign="top">0.774<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.767-0.781)</td><td align="left" valign="top">0.771<sup><xref 
ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.764-0.778)</td><td align="left" valign="top">0.633<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.628-0.639)</td><td align="left" valign="top">0.634<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.627-0.640)</td></tr><tr><td align="left" valign="top">&#x2003;Truth</td><td align="left" valign="top">2292.119</td><td align="left" valign="top">0.782<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.773-0.791)</td><td align="left" valign="top">0.778<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.772-0.784)</td><td align="left" valign="top">0.773<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.767-0.779)</td><td align="left" valign="top">0.639<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.633-0.644)</td><td align="left" valign="top">0.637<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.630-0.644)</td></tr><tr><td align="char" char="." 
valign="top" colspan="7">(0.1,0.1,0.1,0.7)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Random</td><td align="left" valign="top">2584.656</td><td align="left" valign="top">0.731 (0.718-0.745)</td><td align="left" valign="top">0.740 (0.730-0.750)</td><td align="left" valign="top">0.743 (0.735-0.752)</td><td align="left" valign="top">0.739 (0.730-0.747)</td><td align="left" valign="top">0.586 (0.580-0.592)</td></tr><tr><td align="left" valign="top">&#x2003;Hierarchical</td><td align="left" valign="top">2287.864</td><td align="left" valign="top">0.769<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.756-0.782)</td><td align="left" valign="top">0.776<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.771-0.782)</td><td align="left" valign="top">0.776<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.770-0.781)</td><td align="left" valign="top">0.772<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.767-0.777)</td><td align="left" valign="top">0.601<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.595-0.607)</td></tr><tr><td align="left" valign="top">&#x2003;K-means</td><td align="left" valign="top">2276.252</td><td align="left" valign="top">0.775<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.762-0.788)</td><td align="left" valign="top">0.777<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.771-0.783)</td><td align="left" valign="top">0.774<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.769-0.780)</td><td align="left" valign="top">0.771<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.765-0.776)</td><td align="left" valign="top">0.600<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.594-0.606)</td></tr><tr><td align="left" valign="top">&#x2003;Truth</td><td align="left" valign="top">2290.974</td><td align="left" valign="top">0.772<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> 
(0.759-0.785)</td><td align="left" valign="top">0.780<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.774-0.786)</td><td align="left" valign="top">0.778<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.773-0.783)</td><td align="left" valign="top">0.774<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.769-0.780)</td><td align="left" valign="top">0.601<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (0.596-0.607)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table1fn2"><p><sup>b</sup>N/A: not applicable.</p></fn><fn id="table1fn3"><p><sup>c</sup>indicates that the AUROC is significantly higher than that of random sampling method at the .05 significance level.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Evaluation of Sampling Methods Using Real-World Data</title><p>In real-world data, the true number of clusters or patient subgroups is unknown. We therefore explored a range of potential cluster structures, including 2, 3, 4, 5, 10, 15, and 20 cluster structures. Based on simulated data results, with enough replications, the choice of clustering method does not impact our sampling approach; thus, we only evaluated hierarchical clustering in the real-world data analysis. 
<xref ref-type="fig" rid="figure5">Figure 5</xref> shows the difference in the mean 1st to 20th nearest neighbor distances in samples generated using the coverage and random samples (ie, coverage sample <inline-formula><mml:math id="ieqn27"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance subtracted from random sample <inline-formula><mml:math id="ieqn28"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance). The results demonstrate that for smaller sample sizes (50 and 100), samples drawn from structures with 2, 3, and 4 clusters provide a more accurate representation than their random counterparts. However, as the chart review sample size increases from 400 to 800, the <inline-formula><mml:math id="ieqn29"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance of the coverage samples aligns more closely with that of random samples. It is noteworthy that samples derived from 10, 15, and 20 cluster structures perform less effectively across all sample sizes.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Mean n<sup>th</sup> nearest neighbor distance difference (random sample distance&#x2013;coverage sample distance) over 100 replications for real-world data.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e72068_fig05.png"/></fig><p>We conducted a chart review of the 100 random samples and 100 coverage samples. Our coverage sampling approach selected a sample based on 1st nearest neighbor distance. Assessing the clusters using nearest neighbor distance indicated that 2 clusters were the optimal cluster structure. 
Major clinical differences between the 2 clusters included the C-Reactive protein test, D-dimer test, and BMI (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Moreover, while clustering was performed using only clinical variables, the resulting clusters also exhibited meaningful demographic differences, with cluster 1 consisting of an older population compared to cluster 2 (50% vs 25% individuals older than 65 y old). Additional details regarding the cluster characteristics are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. <xref ref-type="table" rid="table2">Table 2</xref> presents the demographic characteristics of the full 7743 patient cohort, as well as the demographics of the coverage and random samples. The standardized mean differences of the random sample and coverage sample are also shown. Notably, the coverage sample includes a higher percentage of young adults (24/100) compared to the random sample (11/100), with other demographic variables showing similar prevalence patterns in both samples.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Demographic characteristics of real-world data, full sample, coverage sample, and random sample.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Full sample</td><td align="left" valign="bottom">Test dataset</td><td align="left" valign="bottom">Coverage sample</td><td align="left" valign="bottom">Random sample</td><td align="left" valign="bottom">Random versus cluster SMD<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Sample size</td><td align="left" valign="top">7743</td><td align="left" valign="top">441</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">Male sex, n (%)</td><td align="left" valign="top">3737 (48.3)</td><td align="left" valign="top">221 (50.1)</td><td align="left" valign="top">43 (43.0)</td><td align="left" valign="top">47 (47.0)</td><td align="left" valign="top">0.080</td></tr><tr><td align="left" valign="top">Age (years), n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">0.391</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Children (0&#x2010;18)</td><td align="left" valign="top">307 (4.0)</td><td align="left" valign="top">11 (2.5)</td><td align="left" valign="top">3 (3.0)</td><td align="left" valign="top">2 (2.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Young adult (18-35)</td><td align="left" valign="top">995 (12.9)</td><td align="left" valign="top">63 (14.3)</td><td align="left" valign="top">24 (24.0)</td><td align="left" valign="top">11 (11.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Middle adult (35-65)</td><td align="left" valign="top">3015 (38.9)</td><td align="left" valign="top">177 (40.1)</td><td align="left" valign="top">38 (38.0)</td><td align="left" valign="top">38 (38.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Older adult (&#x003E;65)</td><td align="left" valign="top">3426 (44.2)</td><td align="left" valign="top">190 (43.1)</td><td align="left" valign="top">35 (35.0)</td><td align="left" valign="top">49 (49.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Race and 
Ethnicity, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">0.131</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hispanic</td><td align="left" valign="top">833 (10.8)</td><td align="left" valign="top">30 (6.8)</td><td align="left" valign="top">13 (13.0)</td><td align="left" valign="top">15 (15.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Non-Hispanic Black</td><td align="left" valign="top">2999 (38.7)</td><td align="left" valign="top">211 (47.8)</td><td align="left" valign="top">44 (44.0)</td><td align="left" valign="top">39 (39.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Non-Hispanic white</td><td align="left" valign="top">3572 (46.1)</td><td align="left" valign="top">187 (42.4)</td><td align="left" valign="top">36 (36.0)</td><td align="left" valign="top">40 (40.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Non-Hispanic Asian</td><td align="left" valign="top">110 (1.4)</td><td align="left" valign="top">1 (0.2)</td><td align="left" valign="top">3 (3.0)</td><td align="left" valign="top">2 (2.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other races</td><td align="left" valign="top">229 (3.0)</td><td align="left" valign="top">12 (2.7)</td><td align="left" valign="top">4 (4.0)</td><td align="left" valign="top">4 (4.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5">Group primary 
payment</td><td align="left" valign="top">0.239</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Private</td><td align="left" valign="top">3564 (46.0)</td><td align="left" valign="top">216 (49.0)</td><td align="left" valign="top">48 (48.0)</td><td align="left" valign="top">50 (50.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Public</td><td align="left" valign="top">3215 (41.5)</td><td align="left" valign="top">176 (39.9)</td><td align="left" valign="top">42 (42.0)</td><td align="left" valign="top">35 (35.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-pay</td><td align="left" valign="top">307 (4.0)</td><td align="left" valign="top">22 (5.0)</td><td align="left" valign="top">4 (4.0)</td><td align="left" valign="top">3 (3.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Others</td><td align="left" valign="top">657 (8.5)</td><td align="left" valign="top">27 (6.1)</td><td align="left" valign="top">6 (6.0)</td><td align="left" valign="top">12 (12.0)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>SMD: standardized mean differences.</p></fn></table-wrap-foot></table-wrap><p>After generating labels, we fit a least absolute shrinkage and selection operator logistic regression model to learn and test a CP. <xref ref-type="table" rid="table3">Table 3</xref> presents the <italic>1st</italic> nearest neighbor distance for both the coverage sample and random sample, as well as the AUROC for the learned CP. 
Additionally, we plot the 1st to 20th nearest neighbor distances of coverage and random sample in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Given that the true cluster structure is unknown, we report the AUROC at the demographic variable level. The 1st nearest neighbor distance and the AUROC results indicate that the coverage sample slightly outperforms the random sample. This pattern is also observed at the demographic feature level; however, these differences did not reach statistical significance. Nonetheless, in terms of magnitude, coverage samples demonstrate a notable improvement in the coverage of young adults compared to random samples. Of note, we found that increased coverage of demographic groups is not directly correlated with model performance. For example, the coverage and random samples both have similar proportions of males and females. However, the coverage sampling performs nominally better within each sex group. This supports not including demographics in the clustering step and relying on clinical drivers of differentiation. We further evaluated the performance of coverage sampling using additional model architectures, including random forest and XGBoost, for real-world data. Across all models, the coverage sampling approach consistently performed as well as, or better than, random sampling. More details can be found in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>For very small subgroups, the performance of the coverage sample may vary. For instance, in the &#x201C;Other race&#x201D; group (n=12), the coverage sample shows a substantial AUROC improvement over random sampling (0.925 vs 0.611). 
In contrast, for the &#x201C;Other payment&#x201D; group (n=27), the coverage sample performs slightly worse than random sampling (0.805 vs 0.820).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mean area under the curve comparison between the coverage sample and the random sample on real-world data.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics (n)</td><td align="left" valign="bottom">Random sample AUROC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> (95% CI)</td><td align="left" valign="bottom">Cluster sample AUROC (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">First nearest neighbor distance</td><td align="left" valign="top">57777.82</td><td align="left" valign="top">54213.58</td></tr><tr><td align="left" valign="top">Overall</td><td align="left" valign="top">0.726 (0.680-0.772)</td><td align="left" valign="top">0.747 (0.701-0.793)</td></tr><tr><td align="left" valign="top" colspan="3">Sex</td></tr><tr><td align="left" valign="top">&#x2003;Female (n=220)</td><td align="left" valign="top">0.724 (0.659-0.784)</td><td align="left" valign="top">0.763 (0.695-0.824)</td></tr><tr><td align="left" valign="top">&#x2003;Male (n=221)</td><td align="left" valign="top">0.725 (0.656-0.789)</td><td align="left" valign="top">0.730 (0.663-0.797)</td></tr><tr><td align="left" valign="top" colspan="3">Age (years), n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Children (0&#x2010;18; n=2)</td><td align="left" valign="top">0.609 (0.312-0.875)</td><td align="left" valign="top">0.656 (0.312-0.937)</td></tr><tr><td align="left" valign="top">&#x2003;Young adult (18 &#x2013; 35; n=65)</td><td align="left" valign="top">0.789 (0.685-0.886)</td><td align="left" valign="top">0.867 (0.774-0.947)</td></tr><tr><td align="left" valign="top">&#x2003;Middle adult (35 &#x2013; 65; n=179)</td><td align="left" valign="top">0.723 (0.652-0.793)</td><td 
align="left" valign="top">0.727 (0.647-0.797)</td></tr><tr><td align="left" valign="top">&#x2003;Older adult (&#x003E;65; n=185)</td><td align="left" valign="top">0.669 (0.587-0.749)</td><td align="left" valign="top">0.673 (0.588-0.754)</td></tr><tr><td align="left" valign="top" colspan="3">Race and ethnicity, n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Hispanic (n=30)</td><td align="left" valign="top">0.828 (0.674-0.963)</td><td align="left" valign="top">0.850 (0.692-0.973)</td></tr><tr><td align="left" valign="top">&#x2003;Non-Hispanic Black (n=211)</td><td align="left" valign="top">0.730 (0.660,0.791)</td><td align="left" valign="top">0.736 (0.664-0.802)</td></tr><tr><td align="left" valign="top">&#x2003;Non-Hispanic white (n=187)</td><td align="left" valign="top">0.703 (0.629,0.778)</td><td align="left" valign="top">0.725 (0.658-0.798)</td></tr><tr><td align="left" valign="top">&#x2003;Non-Hispanic Asian (n=1)</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Other races (n=12)</td><td align="left" valign="top">0.611 (0.222-0.944)</td><td align="left" valign="top">0.925 (0.778-1)</td></tr><tr><td align="left" valign="top" colspan="3">Group primary payment</td></tr><tr><td align="left" valign="top">&#x2003;Private (n=216)</td><td align="left" valign="top">0.698 (0.626-0.764)</td><td align="left" valign="top">0.737 (0.667-0.805)</td></tr><tr><td align="left" valign="top">&#x2003;Public (n=176)</td><td align="left" valign="top">0.743 (0.672-0.811)</td><td align="left" valign="top">0.736 (0.661-0.809)</td></tr><tr><td align="left" valign="top">&#x2003;Self-pay (n=22)</td><td align="left" valign="top">0.642 (0.423-0.857)</td><td align="left" valign="top">0.733 (0.485-0.923)</td></tr><tr><td align="left" valign="top">&#x2003;Others (n=27)</td><td align="left" valign="top">0.820 (0.641-0.961)</td><td align="left" 
valign="top">0.805 (0.623-0.950)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>N/A: Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>CPs are a key component of secondary research with EHR data [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. A required step in CP development is conducting a manual chart review to establish a set of &#x201C;gold-standard&#x201D; labels to identify patients with and without the condition or outcome of interest. This manual review can be highly time-consuming [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Little work has been conducted on how to optimally select charts for review, with investigators most often using random chart selection [<xref ref-type="bibr" rid="ref11">11</xref>]. This can lead to inefficiencies as potentially informative or edge cases can be missed. To address this concern, we have proposed a sampling strategy to select charts for review that captures the diversity of a population of interest. The key aspect of our method is identifying the optimal sample using a new metric that we have termed the <inline-formula><mml:math id="ieqn30"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance. We assessed our method using both simulated and real-world data, evaluating both the information coverage and CP performance. Our findings indicate that coverage sampling performs as well as, if not better than, random sampling. We recommend using a representative sample when developing a computable phenotype. 
Alternatively, if the phenotype has already been developed, it should be recalibrated using a representative sample of the population before deployment.</p><p>One of the motivations for this approach is the presumption that within any group of patients with a particular condition, there are patient subgroups that may have a different presentation of that condition. For example, while many patients with diabetes will have glycosylated hemoglobin test values &#x003E;6.5%, there will be some individuals with controlled diabetes and normal glycosylated hemoglobin values; however, these patients still have diabetes [<xref ref-type="bibr" rid="ref27">27</xref>]. Such scenarios require the creation of complex CPs that can identify patients with diabetes who have a variety of disease presentations [<xref ref-type="bibr" rid="ref28">28</xref>]. If these patient subgroups are small enough, a random selection of charts may not provide sufficient coverage of these subgroups to ensure that the CP performs equitably for all patient subgroups. As our results demonstrate, coverage sampling has its greatest impact on CP performance when minority subgroups are present. However, the presence of such minority subgroups is not a requirement for the method to perform well. In scenarios without minority subgroups, our method performs comparably to random sampling, highlighting the robustness of the approach. Moreover, even when there is no underlying cluster structure at all, coverage sampling performs as well as random sampling.</p><p>A novel aspect of our approach is the development of a metric, the <inline-formula><mml:math id="ieqn31"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance, to measure the coverage of a given sample. 
While previous methods have used nearest neighbor distances to detect and quantify spatial randomness, they have primarily focused on analyzing spatial patterns in populations [<xref ref-type="bibr" rid="ref29">29</xref>]. Existing representativeness metrics, such as Simpson&#x2019;s Diversity Index and Shannon&#x2019;s Entropy, quantify overall variability across multiple demographic features [<xref ref-type="bibr" rid="ref30">30</xref>]. Simpson&#x2019;s Diversity Index measures the probability that 2 individuals randomly selected from a sample will belong to different categories, thereby emphasizing the dominance or evenness of group representation [<xref ref-type="bibr" rid="ref31">31</xref>]. Shannon&#x2019;s Entropy quantifies diversity by accounting for both the abundance and the evenness of the categories present, using information theory to assess the uncertainty in predicting the category of a randomly chosen individual [<xref ref-type="bibr" rid="ref32">32</xref>]. While these metrics effectively address general diversity measurement goals, they do not directly align with our specific goal of evaluating the representation and coverage of minority subgroups. The <inline-formula><mml:math id="ieqn32"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance explicitly evaluates the distance between records in the unsampled group to those in the sampled group. This targeted focus enables a more precise assessment of the extent to which minority subgroups are included in study samples, thereby avoiding underrepresentation in the set of records used for CP development. 
As our real data analysis results showed, we can use the <inline-formula><mml:math id="ieqn33"><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> nearest neighbor distance to choose an optimal number of clusters, k, from which to sample. In particular, choosing too large a value of k leads to suboptimal performance.</p><p>Although the construction and generation of a chart review sample have not been widely discussed in the CP literature, parallel work exists in the active learning literature. Current active learning methods can be categorized as query-acquiring (pool-based) or query-synthesizing [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. We focus on query-acquiring active learning, as query-synthesizing methods are not directly analogous to our work. Query-acquiring active learning uses various sampling strategies, including uncertainty sampling or information-theoretic measures, to identify which sampling strategies would be most impactful for continued labeling [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. Therefore, the underlying burden for query-acquiring active learning is the same as in our method: the need for efficient labeling [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Most existing methods, including uncertainty sampling or information-theoretic measures, focus on identifying the most influential records to enhance the performance of a given prediction task [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. In contrast, our method is not based on a supervised objective. Further, our method seeks to select records that best capture diversity, rather than records that are most representative. 
While representativeness and diversity may be related, they are not necessarily equivalent.</p><p>To illustrate our approach, we tested our method with the real-world task of identifying hospital encounters due to COVID-19. During the height of the COVID-19 pandemic (2020&#x2010;2023), all patients admitted to our health system&#x2019;s hospitals were tested for SARS-CoV-2. As we and others have noted, approximately 38.2% of patients with a positive SARS-CoV-2 test were admitted for reasons other than COVID-19 [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Therefore, if one wanted to identify patients admitted due to COVID-19, a positive SARS-CoV-2 test would not be a sufficient CP because of its poor specificity. We compared the performance of the chart review sample based on a random selection of charts and our coverage sampling method. Overall, the coverage sample yielded a comparable performing CP. So, while coverage sampling did not yield better performance, the results conform to the simulation findings, which indicate that coverage is robust even when there is no underlying cluster structure or minority group.</p><p>While we selected charts based solely on clinical data elements, there were meaningful demographic differences between samples derived from randomly selected charts and through coverage sampling. For example, cluster 1 includes a much older patient population, with 50.2% (2959/5897) over the age of 65, compared to 25.3% (467/1846) in cluster 2. Patients in cluster 1 also have longer hospital stays (9.42 d on average) than those in cluster 2 (3.52 d). Additional details on cluster characteristics are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Consequently, the coverage sample exhibits a different distribution than the random sample. 
Specifically, the coverage sample included younger patients, a greater number of non-Hispanic Black patients (though fewer Hispanic patients), and more individuals with public insurance compared to the cohort derived from random sampling. This result highlights one of the key opportunities in this approach: deriving a less biased sample on which to build a CP. As others have described, one of the mechanisms of algorithmic bias is having unrepresentative samples used to develop the algorithm [<xref ref-type="bibr" rid="ref41">41</xref>]. For instance, in the context of rare diseases, the typical ratio of patients with a given rare condition to those without the condition is approximately 1:100 [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. In such scenarios, using random sampling may result in underrepresentation of minority subgroups in the chart review sample. When the review sample does not accurately reflect the patient population, the resulting CPs can produce biased results. For example, if certain demographic groups are underrepresented in the dataset, the CP may not learn to make accurate predictions for these groups, leading to disparities in performance [<xref ref-type="bibr" rid="ref44">44</xref>]. To account for this, algorithmic solutions have been proposed, including data augmentation [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>], resampling techniques [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>], and algorithmic adjustments [<xref ref-type="bibr" rid="ref49">49</xref>]. However, instead of addressing this problem algorithmically, we propose addressing it via design. As such, by clustering the data and sampling equally from the obtained clusters, we aim for the chart review sample to better represent the patient population. An advantage of our method is that it does not require the researcher to prespecify groups. 
Moreover, as our empirical results show, we are able to capture demographic diversity with just clinical data.</p></sec><sec id="s4-2"><title>Limitations</title><p>While our approach shows promise, there are some limitations. First, while our simulation results illustrate the potential improvement for coverage sampling, our real-world data example only showed nominal improvement. Further work should be conducted in other contexts. Second, the performance of our method is related to the quality of the cluster analysis. As others have noted, clustering methods can be highly variable [<xref ref-type="bibr" rid="ref50">50</xref>]. This variability may be more obvious in EHR data, which often experience data quality issues. Because the clustering step is a means of obtaining a representative sample, we address this by generating multiple samples from multiple cluster structures and selecting the one with the best coverage. In principle, it is possible to skip the clustering step and directly choose an optimal sample, leading to more robust results. While such an approach is worthy of further exploration, it would be more computationally expensive and would not necessarily yield meaningfully better results. Third, while our study suggests that coverage sampling is not very sensitive to the choice of <italic>n</italic>, its robustness warrants further evaluation. Future researchers are encouraged to test a small range of <italic>n</italic> values (eg, n=1, 5, 10), as different choices of <italic>n</italic> may yield different samples. Another potential limitation is that the coverage sample (intentionally) generates a sample that will likely have a different event rate than the true event rate within the full patient population. While this does not present a problem for rank-based metrics like AUROC, it may affect the calibration of other metrics, such as Kullback-Leibler divergence [<xref ref-type="bibr" rid="ref51">51</xref>]. 
When calibration is a priority, recalibration methods can be used [<xref ref-type="bibr" rid="ref52">52</xref>].</p></sec><sec id="s4-3"><title>Conclusions</title><p>Overall, our results show that our coverage sampling method can provide a more representative sample than random sampling, especially when the source cohort contains minority subgroups. This approach can lead to the generation of a CP that has better performance in the overall study population as well as within subgroups. While CP development is a key part of secondary research with EHR data, little work has been done on how best to derive samples for learning CPs. This work addresses this gap and seeks to spur more investigation in this area. Ultimately, this sampling method has the potential to improve future clinical research by making gold-standard chart review labeling a more efficient process.</p></sec></sec></body><back><ack><p>This work was supported by Food and Drug Administration Broad Agency Announcement (FDA BAA) 75F40121C00158 (principal investigator: BAG).</p><p>During the preparation of this work, the authors used ChatGPT in order to improve the language and readability of the manuscript. 
After using this tool or service, the authors reviewed and edited the content as needed and take full responsibility for the content of the publication.</p></ack><notes><sec><title>Funding</title><p>This work was supported by Food and Drug Administration Broad Agency Announcement (FDA BAA) 75F40121C00158 (principal investigator: BAG).</p></sec><sec><title>Data Availability</title><p>The code for the simulated dataset used in this study is available at GitHub [<xref ref-type="bibr" rid="ref22">22</xref>]; the real-world datasets analyzed during this study are not publicly available due to Duke University policies but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>ZW: Conceptualization; Data Curation; Formal analysis; Methodology; Writing-original draft; Writing-review and editing.</p><p>JH: Data Curation; Writing-review and editing.</p><p>CH: Writing-review and editing.</p><p>BG: Conceptualization; Funding Acquisition; Methodology; Supervision; Writing-review and editing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUROC</term><def><p> area under the receiver operator characteristic curve</p></def></def-item><def-item><term id="abb2">CP</term><def><p>computable phenotypes</p></def></def-item><def-item><term id="abb3">EHR</term><def><p>electronic health record</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>ECH</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Characterizing outpatient problem list completeness and duplications in the electronic health record</article-title><source>J Am Med Inform 
Assoc</source><year>2020</year><month>08</month><day>1</day><volume>27</volume><issue>8</issue><fpage>1190</fpage><lpage>1197</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa125</pub-id><pub-id pub-id-type="medline">32620950</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grauer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kneifati-Hayek</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reuland</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Indication alerts to improve problem list documentation</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>04</month><day>13</day><volume>29</volume><issue>5</issue><fpage>909</fpage><lpage>917</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab285</pub-id><pub-id pub-id-type="medline">34957491</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghirardello</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garr&#x00E8;</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Rossi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Maghnie</surname><given-names>M</given-names> </name></person-group><article-title>The diagnosis of children with central diabetes insipidus</article-title><source>J Pediatr Endocrinol Metab</source><year>2007</year><month>03</month><volume>20</volume><issue>3</issue><fpage>359</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1515/jpem.2007.20.3.359</pub-id><pub-id pub-id-type="medline">17451074</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Miller</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Grayson</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Strothman</surname><given-names>K</given-names> </name></person-group><article-title>Advances in asthma: New understandings of asthma&#x2019;s natural history, risk factors, underlying mechanisms, and clinical management</article-title><source>J Allergy Clin Immunol</source><year>2021</year><month>12</month><volume>148</volume><issue>6</issue><fpage>1430</fpage><lpage>1441</lpage><pub-id pub-id-type="doi">10.1016/j.jaci.2021.10.001</pub-id><pub-id pub-id-type="medline">34655640</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Olson</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Bielinski</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>Impact of diverse data sources on computational phenotyping</article-title><source>Front Genet</source><year>2020</year><month>06</month><volume>11</volume><pub-id pub-id-type="doi">10.3389/fgene.2020.00556</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gearing</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Mian</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Barber</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ickowicz</surname><given-names>A</given-names> </name></person-group><article-title>A methodology for conducting retrospective chart review research in child and adolescent psychiatry</article-title><source>J Can Acad 
Child Adolesc Psychiatry</source><year>2006</year><month>08</month><volume>15</volume><issue>3</issue><fpage>126</fpage><lpage>134</lpage><pub-id pub-id-type="medline">18392182</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Panacek</surname><given-names>EA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Panacek</surname><given-names>EA</given-names> </name></person-group><article-title>Performing chart review studies</article-title><source>Air Med J</source><year>2007</year><volume>26</volume><issue>5</issue><fpage>206</fpage><lpage>210</lpage><pub-id pub-id-type="doi">10.1016/j.amj.2007.06.007</pub-id><pub-id pub-id-type="medline">17765825</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McKenzie</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rajapakshe</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rajapakshe</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>A</given-names> </name></person-group><article-title>A semiautomated chart review for assessing the development of radiation pneumonitis using natural language processing: diagnostic accuracy and feasibility study</article-title><source>JMIR Med Inform</source><year>2021</year><month>11</month><day>12</day><volume>9</volume><issue>11</issue><fpage>e29241</fpage><pub-id pub-id-type="doi">10.2196/29241</pub-id><pub-id pub-id-type="medline">34766919</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ho</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steinhubl</surname><given-names>SR</given-names> </name><etal/></person-group><article-title>Limestone: high-throughput candidate phenotype generation via tensor factorization</article-title><source>J Biomed Inform</source><year>2014</year><month>12</month><volume>52</volume><fpage>199</fpage><lpage>211</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2014.07.001</pub-id><pub-id pub-id-type="medline">25038555</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carrell</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Floyd</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Gruber</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A general framework for developing computable clinical phenotype algorithms</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>08</month><day>1</day><volume>31</volume><issue>8</issue><fpage>1785</fpage><lpage>1796</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae121</pub-id><pub-id pub-id-type="medline">38748991</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vassar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Holzmann</surname><given-names>M</given-names> </name></person-group><article-title>The retrospective chart review: important methodological considerations</article-title><source>J Educ Eval Health Prof</source><year>2013</year><volume>10</volume><fpage>12</fpage><pub-id pub-id-type="doi">10.3352/jeehp.2013.10.12</pub-id><pub-id 
pub-id-type="medline">24324853</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elfil</surname><given-names>M</given-names> </name><name name-style="western"><surname>Negida</surname><given-names>A</given-names> </name></person-group><article-title>Sampling methods in clinical research; an educational review</article-title><source>Emerg (Tehran)</source><year>2017</year><volume>5</volume><issue>1</issue><fpage>e52</fpage><pub-id pub-id-type="medline">28286859</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waller</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Underhill</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kaiser</surname><given-names>HA</given-names> </name></person-group><article-title>A method for generating simulated plasmodes and artificial test clusters with user-defined shape, size, and orientation</article-title><source>Multivariate Behav Res</source><year>1999</year><month>04</month><day>1</day><volume>34</volume><issue>2</issue><fpage>123</fpage><lpage>142</lpage><pub-id pub-id-type="doi">10.1207/S15327906Mb340201</pub-id><pub-id pub-id-type="medline">26753933</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Krishnan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hurst</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Comparing natural language processing and structured medical data to develop a computable phenotype for patients hospitalized due to 
COVID-19: retrospective analysis</article-title><source>JMIR Med Inform</source><year>2023</year><month>08</month><day>22</day><volume>11</volume><fpage>e46267</fpage><pub-id pub-id-type="doi">10.2196/46267</pub-id><pub-id pub-id-type="medline">37621195</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klann</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Strasser</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Hutch</surname><given-names>MR</given-names> </name><etal/></person-group><article-title>Distinguishing admissions specifically for COVID-19 from incidental SARS-CoV-2 admissions: National Retrospective Electronic Health Record Study</article-title><source>J Med Internet Res</source><year>2022</year><month>05</month><day>18</day><volume>24</volume><issue>5</issue><fpage>e37931</fpage><pub-id pub-id-type="doi">10.2196/37931</pub-id><pub-id pub-id-type="medline">35476727</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hurst</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Maxson</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Permar</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Boulware</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>BA</given-names> </name></person-group><article-title>Development of an electronic health records datamart to support clinical and population health research</article-title><source>J Clin Trans 
Sci</source><year>2021</year><volume>5</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1017/cts.2020.499</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>B</given-names> </name><name name-style="western"><surname>Linder</surname><given-names>F</given-names> </name><name name-style="western"><surname>Mebane</surname><given-names>WR</given-names> </name></person-group><article-title>Active learning approaches for labeling text: review and assessment of the performance of active learning approaches</article-title><source>Polit Anal</source><year>2020</year><month>10</month><volume>28</volume><issue>4</issue><fpage>532</fpage><lpage>551</lpage><pub-id pub-id-type="doi">10.1017/pan.2020.4</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cohn</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Ghahramani</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>MI</given-names> </name></person-group><article-title>Active learning with statistical models [abstract]</article-title><year>1996</year><access-date>2025-11-11</access-date><conf-name>Advances in Neural Information Processing Systems 7 (NIPS 1994)</conf-name><conf-date>Nov 28 to Dec 1, 1994</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://papers.nips.cc/paper_files/paper/1994/hash/7f975a56c761db6506eca0b37ce6ec87-Abstract.html">https://papers.nips.cc/paper_files/paper/1994/hash/7f975a56c761db6506eca0b37ce6ec87-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Huang</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>ZH</given-names> </name></person-group><article-title>Active learning by querying informative and representative examples</article-title><source>IEEE Trans Pattern Anal Mach Intell</source><year>2014</year><month>10</month><volume>36</volume><issue>10</issue><fpage>1936</fpage><lpage>1949</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2014.2307881</pub-id><pub-id pub-id-type="medline">26352626</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ebrahimi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Darrell</surname><given-names>T</given-names> </name></person-group><article-title>Variational adversarial active learning</article-title><year>2019</year><month>03</month><day>31</day><conf-name>2019 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name><pub-id pub-id-type="doi">10.1109/ICCV.2019.00607</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name></person-group><article-title>Adaptive active learning for image classification</article-title><year>2013</year><conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name><conf-date>Jun 23-28, 2013</conf-date><conf-loc>Portland, OR, USA</conf-loc><fpage>859</fpage><lpage>866</lpage><pub-id 
pub-id-type="doi">10.1109/CVPR.2013.116</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name></person-group><source>phenotype-sampling</source><access-date>2025-07-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/justinwang97/phenotype-sampling">https://github.com/justinwang97/phenotype-sampling</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pfaff</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Crosskey</surname><given-names>M</given-names> </name><name name-style="western"><surname>Morton</surname><given-names>K</given-names> </name><name name-style="western"><surname>Krishnamurthy</surname><given-names>A</given-names> </name></person-group><article-title>Clinical Annotation Research Kit (CLARK): computable phenotyping using machine learning</article-title><source>JMIR Med Inform</source><year>2020</year><month>01</month><day>24</day><volume>8</volume><issue>1</issue><fpage>e16042</fpage><pub-id pub-id-type="doi">10.2196/16042</pub-id><pub-id pub-id-type="medline">32012059</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayefi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chomutare</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Challenges and opportunities beyond structured data in analysis of electronic health records</article-title><source>WIREs Computational 
Stats</source><year>2021</year><month>11</month><volume>13</volume><issue>6</issue><pub-id pub-id-type="doi">10.1002/wics.1549</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnston</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Lakzadeh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Donato</surname><given-names>BMK</given-names> </name><name name-style="western"><surname>Szabo</surname><given-names>SM</given-names> </name></person-group><article-title>Methods of sample size calculation in descriptive retrospective burden of illness studies</article-title><source>BMC Med Res Methodol</source><year>2019</year><month>01</month><day>9</day><volume>19</volume><issue>1</issue><fpage>9</fpage><pub-id pub-id-type="doi">10.1186/s12874-018-0657-9</pub-id><pub-id pub-id-type="medline">30626343</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Connolly</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kirwan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Matthews</surname><given-names>A</given-names> </name></person-group><article-title>A scoping review of the methodological approaches used in retrospective chart reviews to validate adverse event rates in administrative data</article-title><source>Int J Qual Health Care</source><year>2024</year><month>05</month><day>10</day><volume>36</volume><issue>2</issue><fpage>mzae037</fpage><pub-id pub-id-type="doi">10.1093/intqhc/mzae037</pub-id><pub-id pub-id-type="medline">38662407</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Richesson</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Rusincovitch</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Wixted</surname><given-names>D</given-names> </name><etal/></person-group><article-title>A comparison of phenotype definitions for diabetes mellitus</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>12</month><volume>20</volume><issue>e2</issue><fpage>e319</fpage><lpage>26</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001952</pub-id><pub-id pub-id-type="medline">24026307</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spratt</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Pereira</surname><given-names>K</given-names> </name><name name-style="western"><surname>Granger</surname><given-names>BB</given-names> </name><etal/></person-group><article-title>Assessing electronic health record phenotypes against gold-standard diagnostic criteria for diabetes mellitus</article-title><source>J Am Med Inform Assoc</source><year>2017</year><month>04</month><day>1</day><volume>24</volume><issue>e1</issue><fpage>e121</fpage><lpage>e128</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocw123</pub-id><pub-id pub-id-type="medline">27616701</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thompson</surname><given-names>HR</given-names> </name></person-group><article-title>Distribution of distance to Nth neighbour in a population of randomly distributed individuals</article-title><source>Ecology</source><year>1956</year><month>04</month><volume>37</volume><issue>2</issue><fpage>391</fpage><lpage>394</lpage><pub-id 
pub-id-type="doi">10.2307/1933159</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peet</surname><given-names>RK</given-names> </name></person-group><article-title>The measurement of species diversity</article-title><source>Annu Rev Ecol Syst</source><year>1974</year><access-date>2025-11-19</access-date><volume>5</volume><fpage>285</fpage><lpage>307</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jstor.org/stable/2096890">https://www.jstor.org/stable/2096890</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simpson</surname><given-names>EH</given-names> </name></person-group><article-title>Measurement of diversity</article-title><source>Nature</source><year>1949</year><month>04</month><day>30</day><volume>163</volume><issue>4148</issue><fpage>688</fpage><lpage>688</lpage><pub-id pub-id-type="doi">10.1038/163688a0</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shannon</surname><given-names>CE</given-names> </name></person-group><article-title>A mathematical theory of communication</article-title><source>Bell System Technical Journal</source><year>1948</year><month>07</month><volume>27</volume><issue>3</issue><fpage>379</fpage><lpage>423</lpage><pub-id pub-id-type="doi">10.1002/j.1538-7305.1948.tb01338.x</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Settles</surname><given-names>B</given-names> </name></person-group><article-title>Active learning literature 
survey</article-title><year>2009</year><access-date>2025-11-24</access-date><publisher-name>Minds@UW</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://minds.wisconsin.edu/handle/1793/60660">https://minds.wisconsin.edu/handle/1793/60660</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Freund</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Seung</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Shamir</surname><given-names>E</given-names> </name><name name-style="western"><surname>Tishby</surname><given-names>N</given-names> </name></person-group><article-title>Selective sampling using the query by committee algorithm</article-title><source>Mach Learn</source><year>1997</year><month>08</month><volume>28</volume><issue>2</issue><fpage>133</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.1023/A:1007330508534</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muslea</surname><given-names>I</given-names> </name><name name-style="western"><surname>Minton</surname><given-names>S</given-names> </name><name name-style="western"><surname>Knoblock</surname><given-names>C</given-names> </name></person-group><article-title>Active semi-supervised learning = robust multi-view learning</article-title><source>ICML &#x2019;02: Proceedings of the Nineteenth International Conference on Machine Learning</source><year>2002</year><volume>2</volume><fpage>435</fpage><lpage>442</lpage><pub-id pub-id-type="doi">10.5555/645531.655845</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>McCallum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nigam</surname><given-names>K</given-names> </name></person-group><article-title>Employing EM and pool-based active learning for text classification</article-title><year>1998</year><conf-name>Proceedings of the 15th International Conference on Machine Learning</conf-name><conf-date>Jul 24-27, 1998</conf-date><conf-loc>Madison, WI, USA</conf-loc><fpage>350</fpage><lpage>358</lpage><pub-id pub-id-type="doi">10.5555/645527.757765</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jain</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>A</given-names> </name></person-group><article-title>Active learning for large multi-class problems</article-title><year>2009</year><conf-name>2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops)</conf-name><conf-date>Jun 20-25, 2009</conf-date><conf-loc>Miami, FL, USA</conf-loc><fpage>762</fpage><lpage>769</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2009.5206651</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kapoor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Grauman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Urtasun</surname><given-names>R</given-names> </name><name name-style="western"><surname>Darrell</surname><given-names>T</given-names> </name></person-group><article-title>Active learning with gaussian processes for object categorization</article-title><year>2007</year><conf-name>2007 IEEE 11th International Conference on Computer 
Vision</conf-name><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1109/ICCV.2007.4408844</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>MacKay</surname><given-names>DJC</given-names> </name></person-group><article-title>Information-based objective functions for active data selection</article-title><source>Neural Comput</source><year>1992</year><month>07</month><volume>4</volume><issue>4</issue><fpage>590</fpage><lpage>604</lpage><pub-id pub-id-type="doi">10.1162/neco.1992.4.4.590</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Koller</surname><given-names>D</given-names> </name></person-group><article-title>Support vector machine active learning with applications to text classification</article-title><source>J Mach Learn Res</source><year>2002</year><month>03</month><volume>2</volume><fpage>45</fpage><lpage>66</lpage><pub-id pub-id-type="doi">10.1162/153244302760185243</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Suresh</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guttag</surname><given-names>J</given-names> </name></person-group><article-title>A framework for understanding sources of harm throughout the machine learning life cycle</article-title><year>2021</year><month>10</month><day>5</day><conf-name>1st ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization (EAAMO &#x2019;21)</conf-name><conf-date>Oct 5-9, 2021</conf-date><pub-id pub-id-type="doi">10.1145/3465416.3483305</pub-id></nlm-citation></ref><ref 
id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mealy</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Wingerchuk</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Greenberg</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Levy</surname><given-names>M</given-names> </name></person-group><article-title>Epidemiology of neuromyelitis optica in the United States: a multicenter analysis</article-title><source>Arch Neurol</source><year>2012</year><month>09</month><volume>69</volume><issue>9</issue><fpage>1176</fpage><lpage>1180</lpage><pub-id pub-id-type="doi">10.1001/archneurol.2012.314</pub-id><pub-id pub-id-type="medline">22733096</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stoller</surname><given-names>JK</given-names> </name></person-group><article-title>The challenge of rare diseases</article-title><source>Chest</source><year>2018</year><month>06</month><volume>153</volume><issue>6</issue><fpage>1309</fpage><lpage>1314</lpage><pub-id pub-id-type="doi">10.1016/j.chest.2017.12.018</pub-id><pub-id pub-id-type="medline">29325986</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Buolamwini</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gebru</surname><given-names>T</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Friedler</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Wilson</surname><given-names>C</given-names> </name></person-group><article-title>Gender shades: intersectional accuracy 
disparities in commercial gender classification</article-title><year>2018</year><month>05</month><access-date>2025-11-19</access-date><conf-name>Proceedings of the 1st Conference on Fairness, Accountability and Transparency</conf-name><conf-loc>New York, NY, USA</conf-loc><fpage>77</fpage><lpage>91</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v81/buolamwini18a.html">https://proceedings.mlr.press/v81/buolamwini18a.html</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kennedy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dras</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gallego</surname><given-names>B</given-names> </name></person-group><article-title>Augmentation of electronic medical record data for deep learning</article-title><source>In Studies in Health Technology and Informatics</source><year>2022</year><publisher-name>IOS Press BV</publisher-name><fpage>582</fpage><lpage>586</lpage><pub-id pub-id-type="doi">10.3233/SHTI220144</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shorten</surname><given-names>C</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>A survey on image data augmentation for deep learning</article-title><source>J Big Data</source><year>2019</year><month>12</month><volume>6</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1186/s40537-019-0197-0</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>WP</given-names> </name></person-group><article-title>SMOTE: synthetic minority over-sampling technique</article-title><source>J Artif Intell Res</source><year>2002</year><volume>16</volume><fpage>321</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1613/jair.953</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DJ</given-names> </name></person-group><article-title>A comparison of oversampling methods for constructing a prognostic model in the patient with heart failure</article-title><year>2020</year><conf-name>2020 International Conference on Information and Communication Technology Convergence (ICTC)</conf-name><conf-date>Oct 21-23, 2020</conf-date><pub-id pub-id-type="doi">10.1109/ICTC49870.2020.9289522</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hino</surname><given-names>H</given-names> </name></person-group><article-title>Active learning: problem settings and recent developments</article-title><source>arXiv</source><comment>Preprint posted online on Dec 8, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2012.04225</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jain</surname><given-names>AK</given-names> </name></person-group><article-title>Data clustering: 50 years beyond K-means</article-title><source>Pattern Recognit Lett</source><year>2010</year><month>06</month><volume>31</volume><issue>8</issue><fpage>651</fpage><lpage>666</lpage><pub-id pub-id-type="doi">10.1016/j.patrec.2009.09.011</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hershey</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Olsen</surname><given-names>PA</given-names> </name></person-group><article-title>Approximating the Kullback Leibler divergence between Gaussian mixture models</article-title><year>2007</year><conf-name>2007 IEEE International Conference on Acoustics, Speech and Signal Processing - ICASSP &#x2019;07</conf-name><conf-loc>Honolulu, HI, USA</conf-loc><fpage>IV</fpage><lpage>317</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2007.366913</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>King</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>L</given-names> </name></person-group><article-title>Logistic regression in rare events data</article-title><source>Polit Anal</source><year>2001</year><volume>9</volume><issue>2</issue><fpage>137</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1093/oxfordjournals.pan.a004868</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplemental materials regarding study results.</p><media xlink:href="medinform_v13i1e72068_app1.docx" xlink:title="DOCX File, 407 
KB"/></supplementary-material></app-group></back></article>