<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i4e35734</article-id>
      <article-id pub-id-type="pmid">35389366</article-id>
      <article-id pub-id-type="doi">10.2196/35734</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Utility Metrics for Evaluating Synthetic Health Data Generation Methods: Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Turbe</surname>
            <given-names>Hugues</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zamstein</surname>
            <given-names>Noa</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
          <degrees>BEng, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Epidemiology and Public Health</institution>
            <institution>University of Ottawa</institution>
            <addr-line>401 Smyth Road</addr-line>
            <addr-line>Ottawa, ON, K1H 8L1</addr-line>
            <country>Canada</country>
            <phone>1 6137975412</phone>
            <email>kelemam@ehealthinformation.ca</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3325-4149</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Mosquera</surname>
            <given-names>Lucy</given-names>
          </name>
          <degrees>BA, MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5289-8372</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Fang</surname>
            <given-names>Xi</given-names>
          </name>
          <degrees>BA, MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5571-7004</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>El-Hussuna</surname>
            <given-names>Alaa</given-names>
          </name>
          <degrees>MSc, MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0070-8362</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Epidemiology and Public Health</institution>
        <institution>University of Ottawa</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Children's Hospital of Eastern Ontario Research Institute</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Replica Analytics Ltd</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Open Source Research Collaboration</institution>
        <addr-line>Aalborg</addr-line>
        <country>Denmark</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Khaled El Emam <email>kelemam@ehealthinformation.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>4</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>4</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>4</issue>
      <elocation-id>e35734</elocation-id>
      <history>
        <date date-type="received">
          <day>15</day>
          <month>12</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>4</day>
          <month>1</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>27</day>
          <month>1</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>13</day>
          <month>2</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Khaled El Emam, Lucy Mosquera, Xi Fang, Alaa El-Hussuna. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 07.04.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/4/e35734" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>A regular task by developers and users of synthetic data generation (SDG) methods is to evaluate and compare the utility of these methods. Multiple utility metrics have been proposed and used to evaluate synthetic data. However, they have not been validated in general or for comparing SDG methods.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study evaluates the ability of common utility metrics to rank SDG methods according to performance on a specific analytic workload. The workload of interest is the use of synthetic data for logistic regression prediction models, which is a very frequent workload in health research.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We evaluated 6 utility metrics on 30 different health data sets and 3 different SDG methods (a Bayesian network, a Generative Adversarial Network, and sequential tree synthesis). These metrics were computed by averaging across 20 synthetic data sets from the same generative model. The metrics were then tested on their ability to rank the SDG methods based on prediction performance. Prediction performance was defined as the difference in the area under the receiver operating characteristic curve, and in the area under the precision-recall curve, between logistic regression prediction models built on synthetic data and those built on real data.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The utility metric best able to rank SDG methods was the multivariate Hellinger distance based on a Gaussian copula representation of real and synthetic joint distributions.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study has validated a generative model utility metric, the multivariate Hellinger distance, which can be used to reliably rank competing SDG methods on the same data set. The Hellinger distance metric can be used to evaluate and compare alternate SDG methods.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>synthetic data</kwd>
        <kwd>data utility</kwd>
        <kwd>data privacy</kwd>
        <kwd>generative models</kwd>
        <kwd>utility metric</kwd>
        <kwd>synthetic data generation</kwd>
        <kwd>logistic regression</kwd>
        <kwd>model validation</kwd>
        <kwd>medical informatics</kwd>
        <kwd>binary prediction model</kwd>
        <kwd>prediction model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Interest in synthetic data generation (SDG) has recently grown. Synthetic data are deemed to have low privacy risks in practice because there is no one-to-one mapping between synthetic records and real people [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Recent evidence supports the low privacy risk claim [<xref ref-type="bibr" rid="ref9">9</xref>]. This enables synthetic data to be used and shared for secondary purposes without the need for further consent [<xref ref-type="bibr" rid="ref10">10</xref>]. In addition to meeting privacy requirements, synthetic data must also have sufficient utility. This utility can be evaluated using utility metrics. Utility metrics are important in hyperparameter tuning of the generative models during training, in communicating data quality to the synthetic data users, and for researchers and analysts when ranking different SDG methods to select the best one. Our focus in this paper is on the ranking of SDG methods.</p>
      <p>Utility metrics can be defined as narrow or broad [<xref ref-type="bibr" rid="ref11">11</xref>]. Narrow metrics are specific to an analysis that is performed with the synthetic data and are also sometimes referred to as workload-aware utility metrics. For example, if the objective is to build a model between a predictor and a binary outcome, controlling for multiple confounders, then the difference in accuracy of real versus synthetic model predictions on holdout data sets would be a workload-aware utility metric. There have been multiple studies evaluating narrow metrics [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. Narrow metrics represent what the data user is ultimately interested in. Data users want synthetic data sets that score highly on narrow utility metrics.</p>
      <p>Researchers and analysts need to rank SDG methods. For example, a developer of an SDG method may use an ensemble of techniques and then select the one with the highest utility as the final result, or analysts may evaluate multiple SDG methods available in the marketplace to select one for their own projects. However, all workloads are typically not known in advance. Therefore, researchers and analysts cannot evaluate the narrow utility of the SDG methods directly. Instead, they need to use broad utility metrics during the SDG construction and evaluation process. A key requirement is that broad utility metrics are predictive of narrow utility metrics for plausible analytic workloads.</p>
      <p>Some studies utilized broad metrics, for example, to compare and improve SDG methods [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. However, many of the broad utility metrics currently used have not been validated. This means that there is a dearth of evidence demonstrating that they are predictive of narrow utility metrics under realistic decision-making scenarios.</p>
      <p>The realistic decision-making scenario that we are considering here is the comparison and ranking of SDG methods. Finding the best SDG method is becoming a more common need in the literature, and we need reliable metrics to be able to draw valid conclusions from these comparisons. Furthermore, in practice, users of SDG methods need to have good metrics to select among a number of these methods that may be available to them.</p>
      <p>Utility metrics can be classified in a different way, which is relevant for our purposes. They can pertain to a specific synthetic data set or to the generative model (“data set–specific” and “model-specific” utility metrics). Because SDG is stochastic, the utility of synthetic data sets generated from the same generative model will vary each time the generative model is run, and sometimes that variation can be substantial. Data set–specific utility metrics are useful when one wants to communicate how good the particular generated data set is to a data user. However, these utility metrics are not necessarily useful, for example, for comparing different generative models because of the stochasticity. A model-specific utility metric reflects the utility of the generated synthetic data sets on average, across many data sets that are generated from the same model. Such a metric is more useful in our context, where we want to compare and rank SDG methods.</p>
      <p>Our focus in the current study is to perform a validation study of broad model-specific utility metrics for structured (tabular) health data sets. While there have been evaluations of generative model utility metrics in the past, these have focused on images rather than structured data [<xref ref-type="bibr" rid="ref20">20</xref>]. One previous more relevant evaluation considered propensity mean squared error (pMSE) [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] as a model utility metric whereby its correlation with binary prediction accuracy on synthetic data was empirically assessed [<xref ref-type="bibr" rid="ref23">23</xref>]. The authors found that when used as a broad model-specific utility metric, by averaging across multiple synthetic data sets, this metric had a moderate correlation with narrow model-specific utility metrics. However, the correlation between a broad metric and a narrow metric across many data sets for a single SDG technique does not reflect an actual decision-making scenario. In practice, we have a single data set and multiple SDG techniques. Therefore, the extent to which the results from that previous study would be informative to our scenario of interest is unclear.</p>
      <p>We build on this previous work by considering other types of broad model-specific utility metrics beyond pMSE and adjust the methodology to more closely model a practical decision-making scenario of an analyst selecting among multiple SDG methods to identify the one with higher narrow utility on logistic regression prediction tasks. This type of prediction task is used often in health research.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>The protocol for this study was approved by the CHEO Research Institute Research Ethics Board (number CHEOREB# 21/144X). Our objective was to answer the following question: Which broad model-specific utility metrics can be used to rank SDG methods in terms of the similarity of prediction performance between real and synthetic data? In the following sections we describe the methods that were followed.</p>
      <sec>
        <title>Data Sets</title>
        <p>For our analysis, we used the 30 health data sets that are summarized in Appendix S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. These data sets are available publicly or can be requested from the data custodians. Many of these data sets have been used in previous evaluations of SDG techniques [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], and therefore we can ensure some consistency across studies in this domain. These data sets also represent a heterogeneous set of clinical situations (providing care, observational studies, clinical trials, and registries), a wide range of data set sizes (87-44,842 patients), and variation in data set complexity (as measured using average variable entropy), which allow our evaluations to be more generalizable.</p>
      </sec>
      <sec>
        <title>The Broad Utility Metrics Considered</title>
        <p>Broad utility metrics compare the joint distributions of the real and synthetic data sets. Many metrics have been proposed to compare joint distributions [<xref ref-type="bibr" rid="ref24">24</xref>]. We only focus on 6 multivariate metrics that have been used in previous work to evaluate the utility of synthetic data sets.</p>
      </sec>
      <sec>
        <title>Maximum Mean Discrepancy</title>
        <p>The maximum mean discrepancy metric is one way to test whether samples are from different distributions [<xref ref-type="bibr" rid="ref25">25</xref>]. In our implementation, we used a radial basis function kernel. This metric has been applied to assess the utility of synthetic health data [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. It is also widely used in the training of deep learning models and evaluation of the quality of synthetic data. Recent work on a recurrent Generative Adversarial Network (GAN) and recurrent conditional GAN made use of maximum mean discrepancy to assess whether the time series generated by the generative model implicitly learns the distribution of the true data [<xref ref-type="bibr" rid="ref28">28</xref>]. Another study evaluated synthetic data in the smart grid context, in which a GAN is used to learn the conditional probability distribution of the significant features in the real data set and to generate synthetic data based on the learnt distribution [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
      </sec>
      <sec>
        <title>Multivariate Hellinger Distance</title>
        <p>The Hellinger distance [<xref ref-type="bibr" rid="ref30">30</xref>] has been shown to behave in a manner consistent with other distribution comparison metrics, specifically in the context of evaluating disclosure control methods [<xref ref-type="bibr" rid="ref31">31</xref>], when comparing original and transformed data.</p>
        <p>The Hellinger distance can be derived from the multivariate normal Bhattacharyya distance and has the advantage that it is bounded between 0 and 1 and hence is more interpretable [<xref ref-type="bibr" rid="ref32">32</xref>]. We constructed Gaussian copulas from the original and synthetic data sets [<xref ref-type="bibr" rid="ref33">33</xref>] and then computed the distance between them. The concept of comparing the distance between 2 multivariate Gaussian distributions has been used to train GAN-based SDG methods [<xref ref-type="bibr" rid="ref34">34</xref>]. Additional details on its calculation are provided in Appendix S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Wasserstein Distance</title>
        <p>The <italic>W</italic><sub>1</sub> Wasserstein distance [<xref ref-type="bibr" rid="ref35">35</xref>] is often applied to the training of GANs [<xref ref-type="bibr" rid="ref36">36</xref>]. It has resulted in a learning process that is more robust by alleviating the vanishing gradient issue and mode collapse.</p>
        <p>While GANs have been used extensively as an SDG technique, they very often still have trouble capturing the temporal dependency of the joint probability distributions caused by time-series data. The conditional sig-Wasserstein GAN proposed for time series generation is aimed at addressing this problem [<xref ref-type="bibr" rid="ref37">37</xref>]. Here, the authors combine the signature of paths, which statistically describes the stream of data, and the <italic>W</italic><sub>1</sub> distance, to capture the joint law of time series. By employing the sig-W as the discriminator, sig-Wasserstein GAN shows an ability to generate realistic multidimensional time series. Additional details on its calculation are provided in Appendix S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Cluster Analysis Measure</title>
        <p>The original cluster metric [<xref ref-type="bibr" rid="ref21">21</xref>] was first proposed as a global measure of the data utility of original data and masked data. The cluster analysis has 2 steps: first, merge the original data (O) and masked data (M); then, given a certain number of groups G, perform cluster analysis on the merged data. The measure can be calculated as:</p>
        <graphic xlink:href="medinform_v10i4e35734_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>where <italic>n<sub>j</sub></italic> denotes the number of observations in the <italic>j</italic>th cluster and <italic>n<sub>jo</sub></italic> denotes the number of observations in the <italic>j</italic>th cluster that are from the original data (O). The <italic>c</italic> value is defined as:</p>
        <graphic xlink:href="medinform_v10i4e35734_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>A large <italic>U<sub>c</sub></italic> value indicates disparities in the underlying latent structure of the original and masked data. The weight <italic>w<sub>j</sub></italic> can reflect the importance of certain clusters. This cluster analysis measure is used in the evaluation of synthetic data by simply replacing the original data with real data and the masked data with synthetic data [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
      </sec>
      <sec>
        <title>Distinguishability Metrics</title>
        <p>These broad metrics are based on the idea of training a binary classifier that can discriminate between a real and synthetic record [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. That ability to discriminate is converted into a score.</p>
        <p>A propensity mean square error metric has been proposed to evaluate the similarity of real and synthetic data sets [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], a perspective adopted from the propensity score matching literature [<xref ref-type="bibr" rid="ref40">40</xref>], which we will refer to as <italic>propensityMSE</italic>. To calculate the <italic>propensityMSE</italic>, a classifier is trained on a stacked data set consisting of real observations labelled 1 and synthetic observations labelled 0. The <italic>propensityMSE</italic> score is computed as the mean squared difference of the estimated probability from the average prediction where it is not possible to distinguish between the 2 data sets. If the data sets are of the same size, which is the assumption we make here, and indistinguishable, then the average estimate will be 0.5.</p>
        <p>Another related approach that has been used to evaluate the utility of synthetic data is to take a prediction perspective rather than a propensity perspective. This has been applied with “human discriminators” by asking a domain expert to manually classify sample records as real or synthetic [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. This means that a sample of real records and a sample of synthetic records are drawn, and the 2 sets are shuffled together. Then the shuffled records are presented to clinicians who are experts in the domain, and they are asked to subjectively discriminate between the records by indicating which is real versus synthetic. High distinguishability only occurs when the human discriminator can correctly classify real and synthetic records.</p>
        <p>The use of human discriminators is not scalable; therefore, we can instead use machine learning algorithms that are trained on a training data set and make predictions on a holdout test data set. This approach mimics the subjective evaluations described above. We will refer to this metric as <italic>predictionMSE</italic>. Also note that this calculation is different from the calculation of <italic>propensityMSE</italic> where the training data set is also used to compute the probabilities. Additional details on the calculations are provided in Appendix S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Workload-Aware (Narrow) Metrics</title>
        <p>To assess whether the utility metrics are useful, we evaluated whether they can accurately rank SDG methods on workload-aware metrics. This section describes these workload-aware metrics.</p>
        <p>We built a logistic regression (LR) model for each data set. LR is common in health research, and a recent systematic review has shown that its performance is comparable to that of machine learning models for clinical prediction workloads [<xref ref-type="bibr" rid="ref44">44</xref>]. Furthermore, an evaluation of the relative accuracy of LR models compared to that of other machine learning techniques, such as random forests and support vector machines, on synthetic versus real data sets across multiple types of SDG methods showed that LR models are only very slightly different [<xref ref-type="bibr" rid="ref23">23</xref>]. Therefore, we would expect that using LR would provide broadly applicable and meaningful results.</p>
        <p>We evaluated the prediction accuracy using 3-fold cross-validation. Accuracy was measured using the area under the receiver operating characteristic curve (AUROC) [<xref ref-type="bibr" rid="ref45">45</xref>] and the area under the precision-recall curve (AUPRC) [<xref ref-type="bibr" rid="ref46">46</xref>]. For outcomes that had multiple categories, we used the average of pairwise AUROC values [<xref ref-type="bibr" rid="ref47">47</xref>]. The AUPRC values for multicategory outcomes were macroaveraged. This was performed for each real and each synthetic data set.</p>
        <p>To assess the similarity between the AUROC and AUPRC for the real and synthetic data sets, we computed the absolute difference between them. This provides a measure of how similar the real results are to the synthetic results.</p>
      </sec>
      <sec>
        <title>Evaluation Methodology</title>
        <p>For each of the 30 real data sets, we generated 20 synthetic data sets. The utility metrics and the absolute AUROC difference and absolute AUPRC difference were computed on each of the 20 synthetic data sets, and each of these was averaged. Therefore, for each of the data sets, we had 1 average utility metric value for each of the 6 utility metrics, 1 average AUROC difference value, and 1 average AUPRC difference value. These values are tabulated in Appendix S3 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>SDG Methods</title>
        <p>The main hypothesis that we wanted to test was whether the utility metrics can be used to rank the SDG methods by their AUROC and AUPRC differences. The SDG methods were chosen to achieve representativeness, applicability, and variation.</p>
        <list list-type="order">
          <list-item>
            <p>Representativeness. The methods should reflect those that are often used in the community of practice and by researchers.</p>
          </list-item>
          <list-item>
            <p>Applicability. The methods are those that an analyst would likely want to compare and select from to be consistent with our motivating use case.</p>
          </list-item>
          <list-item>
            <p>Variation. The utility results among the chosen SDG methods should have variation sufficient for utility metrics to detect differences.</p>
          </list-item>
        </list>
        <p>Three generative models were used: conditional GAN [<xref ref-type="bibr" rid="ref48">48</xref>], a Bayesian network [<xref ref-type="bibr" rid="ref49">49</xref>], and a sequential synthesis approach using decision trees [<xref ref-type="bibr" rid="ref19">19</xref>]. The Bayesian network implementation uses a differential privacy approach. These 3 methods were selected for the following reasons: they each represent a class of methods that is often used in the literature (eg, sequential synthesis has been used on health and social sciences data [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref58">58</xref>], as well as Bayesian networks [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref59">59</xref>] and GANs [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>]), they use very different approaches and therefore represent plausible SDG methods that an analyst would want to compare, and they are expected to exhibit large utility level variation given that different SDG methods tend to be better at modeling certain types of variables and relationships. For these 3 reasons, this set of SDG methods was suitable for this study on validating utility metrics.</p>
      </sec>
      <sec>
        <title>Individual Utility Metric Ranking</title>
        <p>We used the Page test to determine whether the utility metric prediction was correct [<xref ref-type="bibr" rid="ref62">62</xref>]. For that, we specified 3 groups for each utility metric: an “L” group where the utility metric indicates low utility (ie, has the highest value since they are all distance-type metrics), an “H” group where the utility metric indicates high utility (ie, has the lowest value), and an “M” group in the middle. This process is repeated for each utility metric. For any particular data set, the generative model with the lowest utility is put in the “L” group, the generative model with the highest utility is put in the “H” group, and the third generative model is in the “M” group. Each generative model in a group is replaced with its AUROC or AUPRC difference value, depending on which workload-aware metric is under evaluation.</p>
        <p>The null hypotheses we were testing are therefore that:</p>
        <p>H0<sub>AUROC</sub>: median(AUROC_Diff<sub>L</sub>) = median(AUROC_Diff<sub>M</sub>) = median(AUROC_Diff<sub>H</sub>)</p>
        <p>H0<sub>AUPRC</sub>: median(AUPRC_Diff<sub>L</sub>) = median(AUPRC_Diff<sub>M</sub>) = median(AUPRC_Diff<sub>H</sub>)</p>
        <p>where the subscript indicates the group. Against the alternatives:</p>
        <p>H1<sub>AUROC</sub>: median(AUROC_Diff<sub>L</sub>) ≥ median(AUROC_Diff<sub>M</sub>) ≥ median(AUROC_Diff<sub>H</sub>)</p>
        <p>H1<sub>AUPRC</sub>: median(AUPRC_Diff<sub>L</sub>) ≥ median(AUPRC_Diff<sub>M</sub>) ≥ median(AUPRC_Diff<sub>H</sub>)</p>
        <p>where at least one of the inequalities is strict. To compute the test statistic, <italic>L</italic>, the data are put in a matrix with 30 rows, one for each data set, and 3 columns, one for each group. The accuracy scores are used to assign a rank to the values in each row. Then the ranks are summed per column <italic>R<sub>j</sub></italic> where <italic>j</italic>=1…3. The <italic>L</italic> statistic is then the sum: <italic>L</italic> = <italic>R<sub>1</sub></italic> + 2<italic>R<sub>2</sub></italic> + 3<italic>R<sub>3</sub></italic>. The larger that value, the greater the evidence supporting the ranking conclusion.</p>
        <p>Because of the relatively small sample size, we used an exact test of statistical significance. This also does not make distributional assumptions on the data, and for the number of data sets we have, this gives us a high-powered test.</p>
        <p>If the test is significant, then the broad utility metric can be used to correctly rank SDG techniques based on their workload (narrow) metrics. Since we were comparing multiple utility metrics, a Bonferroni adjustment was made to the α level of .05 to account for multiple testing.</p>
        <p>The maximum <italic>L</italic> value can be used to identify the utility metric that is best at ranking the SDG methods by prediction accuracy difference. This is particularly useful if more than one metric is found to be statistically significant.</p>
      </sec>
      <sec>
        <title>Aggregate Ranking</title>
        <p>Because each utility metric is expected to rank the SDG methods differently, we wanted to test whether an aggregate ranking would provide a better result than any of the individual utility metric rankings. We hoped to find an “ideal” ranking that has minimal distance to each of the individual rankings on the utility metrics. This can be performed for each data set separately, and then the ideal rankings across all the data sets would be evaluated on the Page test. The result would give us the performance of the aggregate ranking, and we can contrast that with the quality of individual utility metric rankings.</p>
        <p>The distance we used is the Spearman footrule [<xref ref-type="bibr" rid="ref63">63</xref>]. With this approach, if method A has a higher ranking than method B more often than not, method A should rank higher than method B in the ideal ranking. Given the relatively small data set, full enumeration rather than an optimization algorithm was used to find the ideal ranking.</p>
        <p>Given that the <italic>prediction</italic>MSE and <italic>propensity</italic>MSE are strongly related, the former was removed so as to not give that particular ranking a higher weighting in the aggregation.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The results of the ranking of the SDG methods are shown in <xref ref-type="table" rid="table1">Table 1</xref>. All metrics are statistically significant in that the null hypothesis of no difference was rejected. The broad utility metric rankings were close enough to the correct rank, so the relationship was quite strong.</p>
      <p>The test statistic, the <italic>L</italic> value, indicates the strength of the ordering of data. The Hellinger distance had the highest <italic>L</italic> value among all the utility metrics, suggesting that it has an advantage in ordering the SDG methods based on their narrow utility metrics.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Page test results for each of the utility metrics and prediction accuracy.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="390"/>
          <col width="150"/>
          <col width="160"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="top">
              <td>Utility metric</td>
              <td colspan="2">AUROC<sup>a</sup> difference</td>
              <td colspan="2">AUPRC<sup>b</sup> difference</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td><italic>L</italic> value</td>
              <td><italic>P</italic> value</td>
              <td><italic>L</italic> value</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Maximum mean discrepancy</td>
              <td>384</td>
              <td>.00104<sup>c</sup></td>
              <td>392</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Hellinger distance<sup>d</sup></td>
              <td>398</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>409</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Wasserstein distance</td>
              <td>392</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>403</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Cluster analysis</td>
              <td>396</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>405</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Propensity mean squared error</td>
              <td>390</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>394</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Prediction mean squared error</td>
              <td>396</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>397</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Aggregate<sup>d</sup></td>
              <td>400</td>
              <td>&#60;.001<sup>c</sup></td>
              <td>408</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>AUPRC: area under the precision-recall curve.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>Statistically significant at a Bonferroni adjusted α level of .05.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>Highest metric on the test statistic.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>The boxplots in <xref rid="figure1" ref-type="fig">Figure 1</xref> descriptively show the trend for the Hellinger distance. There is a clear trend of higher utility on the narrow AUROC and AUPRC metrics as the Hellinger distances get smaller. The boxplots for the remainder of the utility metrics are included in Appendix S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and they all show trends similar to those seen in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>The relationship between the Hellinger distance and the AUROC and AUPRC. The 3 SDG methods were ordered based on their relative Hellinger distance values into the “H,” “M,” and “L” groups. AUROC: area under the receiver operating characteristic curve; AUPRC: area under the precision-recall curve; SDG: synthetic data generation.</p>
        </caption>
        <graphic xlink:href="medinform_v10i4e35734_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The results for the aggregate ranking are shown in <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>. As can be seen from the <italic>L</italic> statistic and the boxplots, there is only a slight difference between using the Hellinger distance and the aggregate ranking from 5 different utility metrics. In a post-hoc analysis, we removed each of the metrics in turn in a leave-one-out fashion and recomputed the aggregate rank, but these did not produce better results than the one presented here.</p>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>The relationship between the aggregate ranking and the AUROC and AUPRC. AUROC: area under the receiver operating characteristic curve; AUPRC: area under the precision-recall curve.</p>
        </caption>
        <graphic xlink:href="medinform_v10i4e35734_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Summary</title>
        <p>The purpose of our study was to identify the most useful, broad generative model utility metrics. These are different from utility metrics calculated for a particular synthetic data set. Generative model utility characterizes the average utility across synthetic data sets that are produced from a generative model. Given the stochasticity of SDG, such utility metrics are more appropriate for evaluating, comparing, and selecting among SDG models on the same real data set. Single synthetic data set utility metrics, on the other hand, are useful for communicating synthetic data utility to a data user because these pertain to the particular synthetic data set that is being shared.</p>
        <p>We performed our analysis using 3 types of generative models: a conditional GAN, a Bayesian network, and sequential decision trees. These 3 cover a broad cross-section of types of techniques that are used in practice, which would enhance the applicability and generalizability of the results.</p>
        <p>In this study, we evaluated 6 different model-specific utility metrics to determine whether they can be used to rank SDG methods. This is a practical use case that reflects a decision that an analyst using SDG methods would want to make. For example, there are multiple SDG techniques that have been published in the literature, and our ranking results can help an analyst determine the one that would work best on their real data sets.</p>
        <p>We defined workload-aware utility as the ability to develop binary or multicategory prediction models that have similar prediction accuracy, measured by the AUROC and the AUPRC, between the real and synthetic data sets. The construction of binary or multicategory prediction models is an often-used analytical workload for health data sets. We used logistic regression to compute the absolute AUROC and AUPRC differences on real and synthetic data sets.</p>
        <p>Our results based on an evaluation on 30 heterogeneous health data sets indicated that all the utility metrics proposed in the literature will work well. However, the multivariate Hellinger distance computed over the Gaussian copula has a slight advantage in that it provides better utility ordering. Further examination of an aggregate ranking using multiple utility metrics showed only a negligible difference from the results of the Hellinger distance for the AUROC metric, and therefore the simplicity of a single utility metric would be preferred.</p>
        <p>Our results would allow a researcher or analyst to select the SDG method with the highest utility defined in a narrow sense. However, maximum utility does not imply that the privacy risks are acceptably low. As there is a trade-off between utility and privacy, higher utility will increase the privacy risks as well. Therefore, when evaluating SDG methods, it is important to also consider the privacy risks.</p>
        <p>Now that we have validation evidence for a broad utility metric, it can be combined with a privacy metric to provide an overall ranking of SDG methods. For example, membership disclosure metrics for generative models [<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>] can be considered along with the multivariate Hellinger distance when SDG methods are ranked. Metrics combining these 2 risk and utility metrics would be a good avenue for future research.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>An analyst may need to make other kinds of decisions, such as evaluating different SDG models for the purpose of hyperparameter tuning. Our study did not evaluate that specific use case, and therefore we cannot make broader claims that the Hellinger distance metric is suitable for other use cases.</p>
        <p>Our study was performed by averaging the broad and narrow utility across 20 synthetic data sets (iterations). A larger number of iterations was evaluated (50 and 100), and we noted that the differences were not material. We opted to present the smaller number of iterations as these still give us meaningful results and would be faster computationally for others applying these results.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Detailed SDG method descriptions, data set descriptions, and detailed analysis results. SDG: synthetic data generation.</p>
        <media xlink:href="medinform_v10i4e35734_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 484 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUPRC</term>
          <def>
            <p>area under the precision-recall curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GAN</term>
          <def>
            <p>generative adversarial network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">pMSE</term>
          <def>
            <p>propensity mean squared error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SDG</term>
          <def>
            <p>synthetic data generation</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study uses information obtained from www.projectdatasphere.org, which is maintained by Project Data Sphere, LLC. Neither Project Data Sphere, LLC nor the owner(s) of any information from the website has contributed to or approved or is in any way responsible for the contents of this study. This research was enabled in part by support provided by Compute Ontario (computeontario.ca) and Compute Canada (<ext-link ext-link-type="uri" xlink:href="http://www.computecanada.ca" xlink:type="simple">www.computecanada.ca</ext-link>). This work was partially funded by the Canada Research Chairs program through the Canadian Institutes of Health Research, a Discovery Grant RGPIN-2016-06781 from the Natural Sciences and Engineering Research Council of Canada, through a contract with the Bill and Melinda Gates Foundation, and by Replica Analytics Ltd.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>This work was performed in collaboration with Replica Analytics Ltd. This company is a spin-off from the Children’s Hospital of Eastern Ontario Research Institute. KEE is co-founder and has equity in this company. LM and XF are data scientists employed by Replica Analytics Ltd.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>New approaches to data dissemination: a glimpse into the future (?)</article-title>
          <source>CHANCE</source>
          <year>2012</year>
          <month>09</month>
          <day>20</day>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>11</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1080/09332480.2004.10554907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gorde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jajodia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Data synthesis based on generative adversarial networks</article-title>
          <source>Proc VLDB Endow</source>
          <year>2018</year>
          <month>06</month>
          <day>01</day>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>1071</fpage>
          <lpage>1083</lpage>
          <pub-id pub-id-type="doi">10.14778/3231751.3231757</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Bayesian Estimation of Attribute and Identification Disclosure Risks in Synthetic Data</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2022-03-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1804.02784">http://arxiv.org/abs/1804.02784</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taub</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pampaka</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Differential correct attribution probability for synthetic data: an exploration</article-title>
          <source>Privacy in Statistical Databases</source>
          <year>2018</year>
          <publisher-loc>Switzerland</publisher-loc>
          <publisher-name>Springer, Cham</publisher-name>
          <fpage>122</fpage>
          <lpage>137</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Disclosure risk evaluation for fully synthetic categorical data</article-title>
          <source>Privacy in Statistical Databases</source>
          <year>2014</year>
          <publisher-loc>Switzerland</publisher-loc>
          <publisher-name>Springer, Cham</publisher-name>
          <fpage>185</fpage>
          <lpage>199</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Releasing synthetic magnitude microdata constrained to fixed marginal totals</article-title>
          <source>SJI</source>
          <year>2016</year>
          <month>02</month>
          <day>27</day>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>93</fpage>
          <lpage>108</lpage>
          <pub-id pub-id-type="doi">10.3233/sji-160959</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ruiz</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Muralidhar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>On the privacy guarantees of synthetic data: a reassessment from the maximum-knowledge attacker perspective</article-title>
          <source>Privacy in Statistical Databases</source>
          <year>2018</year>
          <publisher-loc>Switzerland</publisher-loc>
          <publisher-name>Springer, Cham</publisher-name>
          <fpage>59</fpage>
          <lpage>74</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Releasing multiply imputed, synthetic public use microdata: an illustration and empirical study</article-title>
          <source>J Royal Statistical Soc A</source>
          <year>2005</year>
          <month>01</month>
          <volume>168</volume>
          <issue>1</issue>
          <fpage>185</fpage>
          <lpage>205</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-985x.2004.00343.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bass</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Evaluating identity disclosure risk in fully synthetic health data: model development and validation</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>11</month>
          <day>16</day>
          <volume>22</volume>
          <issue>11</issue>
          <fpage>e23139</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/11/e23139/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/23139</pub-id>
          <pub-id pub-id-type="medline">33196453</pub-id>
          <pub-id pub-id-type="pii">v22i11e23139</pub-id>
          <pub-id pub-id-type="pmcid">PMC7704280</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hoptroff</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Practical Synthetic Data Generation</source>
          <year>2020</year>
          <publisher-loc>Sebastopol, CA</publisher-loc>
          <publisher-name>O'Reilly Media, Inc</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karr</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Kohnen</surname>
              <given-names>CN</given-names>
            </name>
            <name name-style="western">
              <surname>Oganian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Sanil</surname>
              <given-names>AP</given-names>
            </name>
          </person-group>
          <article-title>A framework for evaluating the utility of data altered to protect confidentiality</article-title>
          <source>Am Stat</source>
          <year>2006</year>
          <month>08</month>
          <volume>60</volume>
          <issue>3</issue>
          <fpage>224</fpage>
          <lpage>232</lpage>
          <pub-id pub-id-type="doi">10.1198/000313006x124640</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pilote</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <collab>GOING-FWD Collaborators</collab>
          </person-group>
          <article-title>Can synthetic data be a proxy for real clinical trial data? A validation study</article-title>
          <source>BMJ Open</source>
          <year>2021</year>
          <month>04</month>
          <day>16</day>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>e043497</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/lookup/pmidlookup?view=long&#38;pmid=33863713"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2020-043497</pub-id>
          <pub-id pub-id-type="medline">33863713</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2020-043497</pub-id>
          <pub-id pub-id-type="pmcid">PMC8055130</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jonker</surname>
              <given-names>Elizabeth</given-names>
            </name>
            <name name-style="western">
              <surname>Sood</surname>
              <given-names>Harpreet</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the utility of synthetic COVID-19 case data</article-title>
          <source>JAMIA Open</source>
          <year>2021</year>
          <month>01</month>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>ooab012</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/33709065"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooab012</pub-id>
          <pub-id pub-id-type="medline">33709065</pub-id>
          <pub-id pub-id-type="pii">ooab012</pub-id>
          <pub-id pub-id-type="pmcid">PMC7936723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiner Benaim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Almog</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gorelik</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hochberg</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nassar</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mashiach</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Khamaisi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lurie</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Azzam</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Khoury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kurnik</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Beyar</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Analyzing medical research results based on synthetic data and their relation to real data results: systematic comparison from five observational studies</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>02</month>
          <day>20</day>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>e16492</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/2/e16492/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/16492</pub-id>
          <pub-id pub-id-type="medline">32130148</pub-id>
          <pub-id pub-id-type="pii">v8i2e16492</pub-id>
          <pub-id pub-id-type="pmcid">PMC7059086</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rankin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Black</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mulvenna</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Epelde</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Reliability of supervised machine learning using synthetic data in health care: model to preserve privacy for data sharing</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>07</month>
          <day>20</day>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>e18910</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/7/e18910/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/18910</pub-id>
          <pub-id pub-id-type="medline">32501278</pub-id>
          <pub-id pub-id-type="pii">v8i7e18910</pub-id>
          <pub-id pub-id-type="pmcid">PMC7400044</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Foraker</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michelson</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Pineda Soto</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Colvin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Loh</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kollef</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Maddox</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Evanoff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dror</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zamstein</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>PRO</given-names>
            </name>
          </person-group>
          <article-title>Spot the difference: comparing results of analyses from real patient data and synthetic derivatives</article-title>
          <source>JAMIA Open</source>
          <year>2020</year>
          <month>12</month>
          <volume>3</volume>
          <issue>4</issue>
          <fpage>557</fpage>
          <lpage>566</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/33623891"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooaa060</pub-id>
          <pub-id pub-id-type="medline">33623891</pub-id>
          <pub-id pub-id-type="pii">ooaa060</pub-id>
          <pub-id pub-id-type="pmcid">PMC7886551</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goncalves</surname>
              <given-names>Andre</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>Priyadip</given-names>
            </name>
            <name name-style="western">
              <surname>Soper</surname>
              <given-names>Braden</given-names>
            </name>
            <name name-style="western">
              <surname>Stevens</surname>
              <given-names>Jennifer</given-names>
            </name>
            <name name-style="western">
              <surname>Coyle</surname>
              <given-names>Linda</given-names>
            </name>
            <name name-style="western">
              <surname>Sales</surname>
              <given-names>Ana Paula</given-names>
            </name>
          </person-group>
          <article-title>Generation and evaluation of synthetic patient data</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2020</year>
          <month>05</month>
          <day>07</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>108</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-020-00977-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-020-00977-1</pub-id>
          <pub-id pub-id-type="medline">32381039</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-020-00977-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC7204018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Platzer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Reutterer</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <month>04</month>
          <day>01</day>
          <access-date>2022-10-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2104.00635">http://arxiv.org/abs/2104.00635</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>Khaled</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>Lucy</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Chaoyi</given-names>
            </name>
          </person-group>
          <article-title>Optimizing the synthesis of clinical trial data using sequential trees</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <month>01</month>
          <day>15</day>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/33186440"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa249</pub-id>
          <pub-id pub-id-type="medline">33186440</pub-id>
          <pub-id pub-id-type="pii">5981525</pub-id>
          <pub-id pub-id-type="pmcid">PMC7810457</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>An empirical study on evaluation metrics of generative adversarial networks</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2022-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1806.07755">http://arxiv.org/abs/1806.07755</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Oganian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Karr</surname>
              <given-names>AF</given-names>
            </name>
          </person-group>
          <article-title>Global measures of data utility for microdata masked for disclosure limitation</article-title>
          <source>J Priv Confid</source>
          <year>2009</year>
          <month>04</month>
          <day>01</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>111</fpage>
          <lpage>124</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.29012/jpc.v1i1.568"/>
          </comment>
          <pub-id pub-id-type="doi">10.29012/jpc.v1i1.568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Snoke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Slavkovic</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>General and specific utility measures for synthetic data</article-title>
          <source>J R Stat Soc A</source>
          <year>2018</year>
          <month>03</month>
          <day>07</day>
          <volume>181</volume>
          <issue>3</issue>
          <fpage>663</fpage>
          <lpage>688</lpage>
          <pub-id pub-id-type="doi">10.1111/rssa.12358</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dankar</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Ibrahim</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Fake it till you make it: guidelines for effective synthetic data generation</article-title>
          <source>Appl Sci</source>
          <year>2021</year>
          <month>02</month>
          <day>28</day>
          <volume>11</volume>
          <issue>5</issue>
          <fpage>2158</fpage>
          <pub-id pub-id-type="doi">10.3390/app11052158</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cha</surname>
              <given-names>SH</given-names>
            </name>
          </person-group>
          <article-title>Comprehensive survey on distance/similarity measures between probability density functions</article-title>
          <source>Int J Math Models Methods Appl Sci</source>
          <year>2007</year>
          <volume>1</volume>
          <issue>4</issue>
          <fpage>300</fpage>
          <lpage>307</lpage>
          <pub-id pub-id-type="doi">10.46300/9101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gretton</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Borgwardt</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rasch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schölkopf</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Smola</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A Kernel Method for the Two-Sample Problem</article-title>
          <source>Advances in Neural Information Processing Systems 19: Proceedings of the 2006 Conference</source>
          <year>2007</year>
          <conf-name>20th Annual Conference on Neural Information Processing Systems: NIPS 2006</conf-name>
          <conf-date>December 4-7, 2006</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <publisher-loc>Cambridge, MA</publisher-loc>
          <publisher-name>MIT Press</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper/2006/file/e9fb2eda3d9c55a0d89c98d6c54b5b3e-Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tucker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Rotalinti</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Myles</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Generating high-fidelity synthetic patient data for assessing machine learning healthcare software</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <month>11</month>
          <day>09</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>147</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-00353-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-00353-9</pub-id>
          <pub-id pub-id-type="medline">33299100</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-020-00353-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC7653933</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Torfi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <article-title>Differentially Private Synthetic Medical Data Generation using Convolutional GANs</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <access-date>2022-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2012.11774">http://arxiv.org/abs/2012.11774</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Esteban</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hyland</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Rätsch</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Real-valued (Medical) Time Series Generation with Recurrent Conditional GANs</article-title>
          <source>arXiv</source>
          <year>2017</year>
          <access-date>2021-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.02633">https://arxiv.org/abs/1706.02633</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kuppannagari</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Kannan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Prasanna</surname>
              <given-names>VK</given-names>
            </name>
          </person-group>
          <article-title>Generative Adversarial Network for Synthetic Time Series Data Generation in Smart Grids</article-title>
          <year>2018</year>
          <conf-name>2018 IEEE International Conference on Communications, Control, and Computing Technologies for Smart Grids (SmartGridComm)</conf-name>
          <conf-date>October 29-31, 2018</conf-date>
          <conf-loc>Aalborg, Denmark</conf-loc>
          <pub-id pub-id-type="doi">10.1109/SmartGridComm.2018.8587464</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le Cam</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>GL</given-names>
            </name>
          </person-group>
          <source>Asymptotics in Statistics: Some Basic Concepts</source>
          <year>2000</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gomatam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Karr</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sanil</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Data swapping as a decision problem</article-title>
          <source>J Off Stat</source>
          <year>2005</year>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>635</fpage>
          <lpage>655</lpage>
          <comment>
            <!-- NOTE(review): this URL resolves to a note on the Bhattacharyya measure, not to the cited J Off Stat article; confirm the intended link. -->
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cs.yorku.ca/~kosta/CompVis_Notes/bhattacharyya.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Derpanis</surname>
              <given-names>KG</given-names>
            </name>
          </person-group>
          <article-title>The Bhattacharyya Measure</article-title>
          <source>CiteSeerX</source>
          <year>2008</year>
          <access-date>2021-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.217.3369">http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.217.3369</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joe</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <source>Dependence Modeling with Copulas</source>
          <year>2015</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Chapman and Hall/CRC</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borji</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Pros and Cons of GAN Evaluation Measures</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2020-05-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1802.03446">http://arxiv.org/abs/1802.03446</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kantorovich</surname>
              <given-names>LV</given-names>
            </name>
          </person-group>
          <article-title>Mathematical Methods of Organizing and Planning Production</article-title>
          <source>Management Science</source>
          <year>1960</year>
          <month>07</month>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>366</fpage>
          <lpage>422</lpage>
          <pub-id pub-id-type="doi">10.1287/mnsc.6.4.366</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arjovsky</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chintala</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bottou</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Wasserstein Generative Adversarial Networks</article-title>
          <year>2017</year>
          <conf-name>The 34th International Conference on Machine Learning</conf-name>
          <conf-date>August 6-11, 2017</conf-date>
          <conf-loc>Sydney, Australia</conf-loc>
          <fpage>214</fpage>
          <lpage>223</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Szpruch</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wiese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Conditional Sig-Wasserstein GANs for Time Series Generation</article-title>
          <source>SSRN</source>
          <year>2020</year>
          <access-date>2021-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3623086">https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3623086</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>On Multivariate Goodness-of-Fit and Two-Sample Testing</article-title>
          <year>2003</year>
          <conf-name>PHYSTAT2003</conf-name>
          <conf-date>September 8-11, 2003</conf-date>
          <conf-loc>Stanford, California</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hediger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Näf</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>On the Use of Random Forest for Two-Sample Testing</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <access-date>2020-05-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1903.06287">http://arxiv.org/abs/1903.06287</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosenbaum</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <article-title>The central role of the propensity score in observational studies for causal effects</article-title>
          <source>Biometrika</source>
          <year>1983</year>
          <volume>70</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>55</lpage>
          <pub-id pub-id-type="doi">10.1093/biomet/70.1.41</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beaulieu-Jones</surname>
              <given-names>BK</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bhavnani</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Byrd</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>Privacy-Preserving Generative Deep Neural Networks Support Clinical Data Sharing</article-title>
          <source>Circ Cardiovasc Qual Outcomes</source>
          <year>2019</year>
          <month>07</month>
          <volume>12</volume>
          <issue>7</issue>
          <fpage>e005122</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ahajournals.org/doi/abs/10.1161/CIRCOUTCOMES.118.005122?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.118.005122</pub-id>
          <pub-id pub-id-type="medline">31284738</pub-id>
          <pub-id pub-id-type="pmcid">PMC7041894</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Biswal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Generating Multi-label Discrete Patient Records using Generative Adversarial Networks</article-title>
          <year>2017</year>
          <conf-name>Machine Learning for Healthcare Conference</conf-name>
          <conf-date>August 18-19, 2017</conf-date>
          <conf-loc>Boston</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://proceedings.mlr.press/v68/choi17a/choi17a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salim</surname>
              <given-names>A</given-names>
              <suffix>Jr</suffix>
            </name>
          </person-group>
          <article-title>Synthetic Patient Generation: A Deep Learning Approach Using Variational Autoencoders</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2021-08-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1808.06444">http://arxiv.org/abs/1808.06444</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Christodoulou</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Verbakel</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Van Calster</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A systematic review shows no performance benefit of machine learning over logistic regression for clinical prediction models</article-title>
          <source>J Clin Epidemiol</source>
          <year>2019</year>
          <month>06</month>
          <volume>110</volume>
          <fpage>12</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2019.02.004</pub-id>
          <pub-id pub-id-type="medline">30763612</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(18)31081-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pepe</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <source>The Statistical Evaluation of Medical Tests for Classification and Prediction</source>
          <year>2004</year>
          <publisher-loc>Oxford</publisher-loc>
          <publisher-name>Oxford University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goadrich</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The relationship between Precision-Recall and ROC curves</article-title>
          <year>2006</year>
          <conf-name>23rd International Conference on Machine Learning (ICML '06)</conf-name>
          <conf-date>June 25-29, 2006</conf-date>
          <conf-loc>Pittsburgh</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1143844.1143874</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hand</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Till</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>A simple generalisation of the area under the ROC curve for multiple class classification problems</article-title>
          <source>Mach Learn</source>
          <year>2001</year>
          <volume>45</volume>
          <issue>2</issue>
          <fpage>171</fpage>
          <lpage>186</lpage>
          <pub-id pub-id-type="doi">10.1023/A:1010920819831</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Skoularidou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuesta-Infante</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Veeramachaneni</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Modeling Tabular data using Conditional GAN</article-title>
          <year>2019</year>
          <conf-name>Advances in Neural Information Processing Systems 32 (NeurIPS 2019)</conf-name>
          <conf-date>December 8-14, 2019</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <fpage>11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/2019/hash/254ed7d2de3b23ab10936522dd547b78-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ping</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanovich</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Howe</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>DataSynthesizer: Privacy-Preserving Synthetic Datasets</article-title>
          <year>2017</year>
          <conf-name>The 29th International Conference on Scientific and Statistical Database Management</conf-name>
          <conf-date>June 27-29, 2017</conf-date>
          <conf-loc>Chicago, IL</conf-loc>
          <fpage>1</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3085504.3091117"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3085504.3091117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>An empirical evaluation of easily implemented, nonparametric methods for generating synthetic datasets</article-title>
          <source>Comput Stat Data Anal</source>
          <year>2011</year>
          <month>12</month>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>3232</fpage>
          <lpage>3243</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csda.2011.06.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arslan</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Schilling</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Gerlach</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Penke</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Using 26,000 diary entries to show ovulatory changes in sexual desire and behavior</article-title>
          <source>J Pers Soc Psychol</source>
          <year>2021</year>
          <month>08</month>
          <volume>121</volume>
          <issue>2</issue>
          <fpage>410</fpage>
          <lpage>431</lpage>
          <pub-id pub-id-type="doi">10.1037/pspp0000208</pub-id>
          <pub-id pub-id-type="medline">30148371</pub-id>
          <pub-id pub-id-type="pii">2018-41799-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bonnéry</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Henneberger</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Lachowicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Stapleton</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Woolley</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The promise and limitations of synthetic data as a strategy to expand access to state-level multi-agency longitudinal data</article-title>
          <source>J Res Educ Eff</source>
          <year>2019</year>
          <month>08</month>
          <day>02</day>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>616</fpage>
          <lpage>647</lpage>
          <pub-id pub-id-type="doi">10.1080/19345747.2019.1631421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabay</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bejugama</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jaceldo-Siegl</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Overcoming small data limitations in heart disease prediction by using surrogate data</article-title>
          <source>SMU Data Science Review</source>
          <year>2018</year>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholar.smu.edu/datasciencereview/vol1/iss3/12"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Freiman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lauger</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Data Synthesis and Perturbation for the American Community Survey at the U.S. Census Bureau</article-title>
          <source>United States Census Bureau</source>
          <year>2017</year>
          <access-date>2021-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.census.gov/content/dam/Census/library/working-papers/2018/adrm/2017%20Data%20Synthesis%20and%20Perturbation%20for%20ACS.pdf">https://www.census.gov/content/dam/Census/library/working-papers/2018/adrm/2017%20Data%20Synthesis%20and%20Perturbation%20for%20ACS.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Utility of synthetic microdata generated using tree-based methods</article-title>
          <year>2015</year>
          <conf-name>UNECE Statistical Data Confidentiality Work Session</conf-name>
          <conf-date>October 5-7, 2015</conf-date>
          <conf-loc>Helsinki, Finland</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://unece.org/statistics/events/SDC2015"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/springerreference_64338</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Practical data synthesis for large samples</article-title>
          <source>J Priv Confid</source>
          <year>2018</year>
          <month>02</month>
          <day>02</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>67</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.29012/jpc.v7i3.407</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Providing bespoke synthetic data for the UK Longitudinal Studies and other sensitive data with the synthpop package for R</article-title>
          <source>Stat J IAOS</source>
          <year>2017</year>
          <month>08</month>
          <day>21</day>
          <volume>33</volume>
          <issue>3</issue>
          <fpage>785</fpage>
          <lpage>796</lpage>
          <pub-id pub-id-type="doi">10.3233/sji-150153</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quintana</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <article-title>A synthetic dataset primer for the biobehavioural sciences to promote reproducibility and hypothesis generation</article-title>
          <source>eLife</source>
          <year>2020</year>
          <month>03</month>
          <day>11</day>
          <access-date>2020-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://elifesciences.org/articles/53275">https://elifesciences.org/articles/53275</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Myles</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tucker</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Generating and Evaluating Synthetic UK Primary Care Data: Preserving Data Utility &amp; Patient Privacy</article-title>
          <year>2019</year>
          <conf-name>IEEE 32nd International Symposium on Computer-Based Medical Systems (CBMS)</conf-name>
          <conf-date>June 5-7, 2019</conf-date>
          <conf-loc>Cordoba, Spain</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8787436"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/cbms.2019.00036</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chin-Cheong</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sutter</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vogt</surname>
              <given-names>JE</given-names>
            </name>
          </person-group>
          <article-title>Generation of Heterogeneous Synthetic Electronic Health Records using GANs</article-title>
          <year>2019</year>
          <conf-name>Workshop on Machine Learning for Health (ML4H) at the 33rd Conference on Neural Information Processing Systems (NeurIPS 2019)</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.research-collection.ethz.ch/handle/20.500.11850/392473"/>
          </comment>
          <pub-id pub-id-type="doi">10.3929/ethz-b-000392473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mesa</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Ensuring electronic medical record simulation through better training, modeling, and evaluation</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>99</fpage>
          <lpage>108</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31592533"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz161</pub-id>
          <pub-id pub-id-type="medline">31592533</pub-id>
          <pub-id pub-id-type="pii">5583723</pub-id>
          <pub-id pub-id-type="pmcid">PMC6913223</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Castellan</surname>
              <given-names>NJ</given-names>
            </name>
          </person-group>
          <source>Nonparametric statistics for the behavioral sciences, 2nd ed</source>
          <year>1988</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>McGraw-Hill Book Company</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pihur</surname>
              <given-names>Vasyl</given-names>
            </name>
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>Susmita</given-names>
            </name>
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>Somnath</given-names>
            </name>
          </person-group>
          <article-title>Weighted rank aggregation of cluster validation measures: a Monte Carlo cross-entropy approach</article-title>
          <source>Bioinformatics</source>
          <year>2007</year>
          <month>07</month>
          <day>01</day>
          <volume>23</volume>
          <issue>13</issue>
          <fpage>1607</fpage>
          <lpage>1615</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btm158</pub-id>
          <pub-id pub-id-type="medline">17483500</pub-id>
          <pub-id pub-id-type="pii">btm158</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fritz</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>GAN-Leaks: A Taxonomy of Membership Inference Attacks against Generative Models</article-title>
          <year>2020</year>
          <conf-name>ACM SIGSAC Conference on Computer and Communications Security</conf-name>
          <conf-date>November 9-13, 2020</conf-date>
          <conf-loc>Virtual Event, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3372297.3417238"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3372297.3417238</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hilprecht</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Härterich</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bernau</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Monte Carlo and Reconstruction Membership Inference Attacks against Generative Models</article-title>
          <source>Proc Priv Enh Technol</source>
          <year>2019</year>
          <volume>4</volume>
          <fpage>232</fpage>
          <lpage>249</lpage>
          <pub-id pub-id-type="doi">10.2478/popets-2019-0067</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
